def set_data(self):
    """Build the class-search form payload from the first HTML form page.

    Parses ``self.htmldata1``, stores every non-blank term option as
    ``label -> code`` in ``self.semesters`` and writes the POST fields to
    ``self.data``.  When no option label contains both ``self.season``
    (case-insensitive) and ``self.year``, a warning is printed and the
    code ``0`` is submitted.
    """
    page = WebPage(htmldata=self.htmldata1)

    # Find all the codes for season/year in the first html form data.
    self.semesters = {}
    xpath = """//*[@id="CLASS_SRCH_WRK2_STRM$35$"]/option"""
    for option in page.get_from_xpath(xpath):
        # NOTE(review): presumably lxml-style elements, where an empty
        # <option> has text None and a missing attribute yields None;
        # normalise both so .strip() cannot raise AttributeError.
        label = option.text or ""
        value = option.get("value") or ""
        if label.strip() and value.strip():
            self.semesters[label] = value

    # Match up season/year to the codes we just found, if possible.
    code = 0
    season = self.season.lower()
    year = str(self.year)
    for label, value in self.semesters.items():
        if season in label.lower() and year in label:
            code = value
            break

    if not code:
        print_color("Warning: failed to find season/year in search options. season='%s' year='%s'" % (self.season, self.year), COLORS.RED)
        print_d("search options", self.semesters)

    self.data = {
        "ICFocus": "SSR_CLSRCH_WRK_ACAD_CAREER$2",
        "CLASS_SRCH_WRK2_STRM$35$": str(code),
        "SSR_CLSRCH_WRK_SUBJECT$0": self.department,
        "ICAction": "CLASS_SRCH_WRK2_SSR_PB_CLASS_SRCH",
        "SSR_CLSRCH_WRK_ACAD_CAREER$2": self.level,
    }
def crawl(self):
    """Crawl URLs from ``self.url_queue`` until it is empty or the limit is hit.

    The loop stops once ``len(self.discovered)`` exceeds
    ``self.MAX_LINKS_TO_VISIT``.  For every fetched page the assets are
    recorded in ``self.assets`` and each same-domain link that is not yet
    discovered is marked discovered and, if not already tracked in
    ``self.unvisited``, queued.

    NOTE(review): ``self.discovered`` and ``self.unvisited`` are indexed
    for unknown keys and compared with ``None`` -- presumably both are
    mappings that default to ``None`` (e.g. a defaultdict); confirm.
    """
    while len(self.url_queue) > 0 and len(self.discovered) <= self.MAX_LINKS_TO_VISIT:
        url = self.url_queue.popleft()

        if 'DEBUG' in os.environ:
            # Single-argument print() emits the same text on Python 2 and 3.
            print("Queue Size: %s" % len(self.url_queue))
            print("Fetching:  %s" % url)

        webpage = WebPage(url)
        self.unvisited[url] = False

        all_links = webpage.get_anchors(False)  # False: dont keep fragments
        all_assets = webpage.get_assets()
        self.assets.append({'url': url, 'assets': all_assets})

        for link in all_links:
            # Only links on the same domain are considered at all.
            if not self.same_domain_rule.matches(link):
                continue
            link_url = link.geturl()
            if self.discovered[link_url] is not None:
                continue  # already discovered
            self.discovered[link_url] = True
            # process if not already in the queue
            if self.unvisited[link_url] is None:
                self.url_queue.append(link_url)
                self.unvisited[link_url] = True
def Crawl(address, recursions=2, txt=""):
    '''
    Fetch *address* and, while recursion depth remains, recurse into up to
    five of its external links that are not yet keys of ``words_list``.

    For every link that was crawled without one of the handled errors, the
    top five words of the text of the page at *address* (not of the link
    itself) are stored under ``words_list[link]``.
    '''
    max_links_amount = 5

    web_page = WebPage(address)
    links = web_page.get_links(only_external_links_allowed=True)
    #print("Links on page: {} found: {}".format(address, len(links)))

    if recursions <= 0:
        return

    remaining_depth = recursions - 1
    accepted = 0
    for link in links:
        if accepted >= max_links_amount:
            break
        if link in words_list:
            continue
        try:
            Crawl(link, remaining_depth, txt)
            page_text = web_page.get_all_text()
            words_list[link] = GetWords(page_text).top_words(5)
            accepted += 1
        except (TooFewLinksOnPage, BannedExtensionPage, ConnectionError):
            #print("Getting page from previous round...")
            continue
def getlinks(self, url, html):
    """Parse *html* for *url* and return its filtered ``<a>`` links.

    The new ``WebPage`` is kept on ``self.webpage``; the filter patterns
    come from ``self.get_patterns_from_rules(url)``.
    """
    page = WebPage(url, html)
    self.webpage = page
    page.parse_links()
    patterns = self.get_patterns_from_rules(url)
    #print ruptn
    return page.filter_links(tags=['a'], patterns=patterns)
def start(self):
    """Pop URLs from ``self.dbop`` and crawl them until none are left.

    Each page is downloaded, stored through ``self.dbop.html2db`` (with the
    extracted article fields when ``extract()`` yields more than five
    items), and its ``<a>`` links matching the URL's rules are added as new
    seeds.  Any exception raised while handling one URL is printed and the
    loop continues.  ``self.dbop.close()`` is always called on exit.
    """
    # Bound before the loop so the error message below can always be
    # formatted, even when the very first pop_url() call raises.
    url = None
    try:
        while True:
            try:
                url = self.dbop.pop_url()
                print("url: %s" % url)
                if url is None:
                    print("crawling task is done.")
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                #print error_msg, url, redirected_url, html
                if html is not None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html,
                                          article[0],
                                          addtime,
                                          article[3],
                                          article[5])
                    else:
                        self.dbop.html2db(url, html)
                    print(self.webpage.parse_links())
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'], str_patterns=ruptn)
                    self.add_seeds(links)
                self.mysleep(3)
            except Exception as err:
                print("!!error!! Exception happend! %s %s" % (url, err))
    finally:
        # Release the database even on KeyboardInterrupt / SystemExit.
        self.dbop.close()
def __init__(self, args, parent=None):
    """Create the main page and wire it to networking and JavaScript.

    Reads ``verbose``, ``script``, ``script_args``, ``proxy``,
    ``disk_cache``, ``ignore_ssl_errors``, ``load_images``,
    ``load_plugins`` and ``local_access_remote`` from *args*.  Exits the
    process when ``:/bootstrap.js`` cannot be opened or is empty.
    """
    QObject.__init__(self, parent)

    # variable declarations
    self.m_defaultPageSettings = {}
    self.m_pages = []
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_returnValue = 0
    self.m_terminated = False
    # setup the values from args
    self.m_scriptFile = args.script
    self.m_args = args.script_args

    self.m_filesystem = FileSystem(self)

    self.m_pages.append(self.m_page)

    do_action('PhantomInitPre')

    if args.proxy:
        app_proxy = QNetworkProxy(QNetworkProxy.HttpProxy,
                                  args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(app_proxy)
    else:
        QNetworkProxyFactory.setUseSystemConfiguration(True)

    # Provide WebPage with a non-standard Network Access Manager
    self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
    self.m_page.setNetworkAccessManager(self.m_netAccessMan)

    self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

    self.m_defaultPageSettings.update({
        'loadImages': args.load_images,
        'loadPlugins': args.load_plugins,
        'userAgent': self.m_page.userAgent(),
        'localAccessRemote': args.local_access_remote,
    })
    self.m_page.applySettings(self.m_defaultPageSettings)

    self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

    # inject our properties and slots into javascript
    main_frame = self.m_page.mainFrame()
    main_frame.addToJavaScriptWindowObject('phantom', self)
    main_frame.addToJavaScriptWindowObject('fs', self.m_filesystem)

    resource = QFile(':/bootstrap.js')
    if not resource.open(QFile.ReadOnly):
        sys.exit('Can not bootstrap!')
    script_text = str(resource.readAll())
    resource.close()
    if not script_text:
        sys.exit('Can not bootstrap!')
    self.m_page.mainFrame().evaluateJavaScript(script_text)

    do_action('PhantomInitPost')
class Crawler():
    """Rule-driven crawler backed by queue, page and duplicate-check databases."""

    def __init__(self):
        """Create the downloader, open the databases and start with no rules."""
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        """Open the URL queue, stored-page and duplicate-check databases."""
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        """Record the not-yet-seen *links* and push them onto the queue."""
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        """Replace the rule table.

        *rules* maps a page-URL regex string to an iterable of link regex
        strings; both sides are stored compiled.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of every rule matching *url*."""
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Crawl until the queue returns ``None``.

        Each downloaded page is stored, parsed for ``<a>`` links filtered
        by the rules for its URL, and those links are queued as new seeds.
        The loop sleeps three seconds after every URL.
        """
        while True:
            url = self.queue.pop_url()
            # Single-argument print() behaves the same on Python 2 and 3.
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)

                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print(ruptn)
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        """Sleep *n* seconds, printing a progress line after each second."""
        for i in range(n):
            time.sleep(1)
            print("sleep %s of %s" % (i, n))
def __init__(self, settings):
    """Assemble cookie jar, network manager, page and view for *settings*."""
    super(Browser, self).__init__()
    self._settings = settings

    # Network stack: the access manager is built around the cookie jar.
    jar = CookieJar()
    self._cookie_jar = jar
    manager = NetworkAccessManager(jar)
    self._network_access_manager = manager

    # The page uses that manager; the view displays the page.
    self._web_page = WebPage(settings)
    self._web_page.setNetworkAccessManager(manager)

    self._web_view = WebView(settings)
    self._web_view.setPage(self._web_page)
class Crawler():
    """Rule-driven crawler that keeps all persistent state in ``OperatorDB``."""

    def __init__(self):
        """Create the downloader and database operator; start with no rules."""
        self.downloader = DownloadManager()
        self.webpage = None
        self.rules = {}
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        """Hand *links* to the database operator as new seeds."""
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        """Replace the rule table.

        *rules* maps a page-URL regex string to an iterable of link regex
        strings; both sides are stored compiled.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of every rule matching *url*."""
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Pop URLs from ``self.dbop`` and crawl them until none are left.

        Each page is downloaded, stored through ``self.dbop.html2db`` (with
        the extracted article fields when ``extract()`` yields more than
        five items), and its filtered ``<a>`` links are added as seeds.
        An exception while handling one URL is printed and the loop goes
        on.  ``self.dbop.close()`` is always called on exit.
        """
        # Bound before the loop so the error message below can always be
        # formatted, even when the very first pop_url() call raises.
        url = None
        try:
            while True:
                try:
                    url = self.dbop.pop_url()
                    print("url: %s" % url)
                    if url is None:
                        print("crawling task is done.")
                        break
                    error_msg, url, redirected_url, html = self.downloader.download(url)
                    #print error_msg, url, redirected_url, html
                    if html is not None:
                        self.webpage = WebPage(url, html)
                        article = self.webpage.extract()
                        if len(article) > 5:
                            addtime = "%s %s" % (article[1], article[2])
                            self.dbop.html2db(url, html,
                                              article[0],
                                              addtime,
                                              article[3],
                                              article[5])
                        else:
                            self.dbop.html2db(url, html)
                        print(self.webpage.parse_links())
                        ruptn = self.get_patterns_from_rules(url)
                        links = self.webpage.filter_links(tags=['a'], str_patterns=ruptn)
                        self.add_seeds(links)
                    self.mysleep(3)
                except Exception as err:
                    print("!!error!! Exception happend! %s %s" % (url, err))
        finally:
            # Release the database even on KeyboardInterrupt / SystemExit.
            self.dbop.close()
def updateView(self):
    """Install a fresh page in the web view and load the generated HTML."""
    web_page = WebPage(logger=None, parent=self)
    web_page.setLinkDelegationPolicy(QWebPage.DelegateAllLinks)
    # Make this object reachable from page scripts as ``qtWindow``.
    web_page.mainFrame().addToJavaScriptWindowObject("qtWindow", self)

    view = self.ui.webView
    view.setPage(web_page)

    content = self.generateHtml()
    # baseUrl must end with a trailing '/' otherwise QWebView won't be able
    # to load files from there
    static_dir = os.path.join(self.dataDir, "static/")
    self.ui.webView.setHtml(content, QUrl.fromLocalFile(static_dir))
def __init__(self, parent, args):
    """Create the main page, apply the default settings and run the bootstrap.

    Reads ``verbose``, ``script``, ``script_args``, ``script_encoding``,
    ``output_encoding``, ``proxy``, ``proxy_type``, ``load_images``,
    ``load_plugins`` and ``local_to_remote_url_access`` from *args*.
    """
    super(Phantom, self).__init__(parent)

    # variable declarations
    self.m_defaultPageSettings = {}
    self.m_pages = []
    self.m_servers = []
    self.m_verbose = args.verbose
    self.m_page = WebPage(self, args)
    self.m_returnValue = 0
    self.m_terminated = False
    # setup the values from args
    self.app_args = args
    self.m_scriptFile = args.script
    self.m_args = args.script_args
    self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
    self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys)

    self.m_pages.append(self.m_page)

    do_action('PhantomInitPre')

    if args.proxy is not None:
        app_proxy = QNetworkProxy(args.proxy_type, args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(app_proxy)
    else:
        QNetworkProxyFactory.setUseSystemConfiguration(True)

    self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

    self.m_defaultPageSettings.update({
        'loadImages': args.load_images,
        'loadPlugins': args.load_plugins,
        'javascriptEnabled': True,
        'XSSAuditingEnabled': False,
        'userAgent': self.m_page.userAgent(),
        'localToRemoteUrlAccessEnabled': args.local_to_remote_url_access,
    })
    self.m_page.applySettings(self.m_defaultPageSettings)

    script_path = os.path.abspath(self.m_scriptFile)
    self.libraryPath = os.path.dirname(script_path)

    # inject our properties and slots into javascript
    self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

    with QPyFile(':/bootstrap.js') as bootstrap_file:
        self.m_page.mainFrame().evaluateJavaScript(bootstrap_file.readAll())

    do_action('PhantomInitPost')
def __init__(self, profile: QWebEngineProfile):
    """Build a window with a read-only address bar above a web view.

    The view gets a ``WebPage`` created from *profile*; the view's and
    page's title, URL, icon, geometry and close signals are connected to
    this widget's handlers.
    """
    super().__init__()
    self.m_addressBar = UrlLineEdit(self)
    self.m_view = WebView(self)

    self.setAttribute(Qt.WA_DeleteOnClose)
    self.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Minimum)

    # Address bar on top, view below.
    vbox = QVBoxLayout()
    # layout.setMargin(0)
    self.setLayout(vbox)
    for widget in (self.m_addressBar, self.m_view):
        vbox.addWidget(widget)

    self.m_view.setPage(WebPage(profile, self.m_view))
    self.m_view.setFocus()

    self.m_addressBar.setReadOnly(True)
    self.m_addressBar.setFavIcon(QIcon(":defaulticon.png"))

    # View-level signals.
    self.m_view.titleChanged.connect(self.setWindowTitle)
    self.m_view.urlChanged.connect(self.setUrl)
    # Page-level signals.
    self.m_view.page().iconChanged.connect(self.handleIconChanged)
    self.m_view.page().geometryChangeRequested.connect(self.handleGeometryChangeRequested)
    self.m_view.page().windowCloseRequested.connect(self.close)
def __init__(self):
    """Create the Qt application (if needed), network stack, page and view.

    Also resets the page-load bookkeeping attributes to ``None`` and
    connects the network manager's ``finished`` signal to
    ``self.network_reply_finished``.
    """
    logging.debug("-->")
    super(WebBrowser, self).__init__()

    # Reuse a running QApplication, otherwise start one.
    existing_app = QApplication.instance()
    self.app = existing_app
    if self.app is None:
        self.app = QApplication(sys.argv)
    self.app.setQuitOnLastWindowClosed(False)

    self.event_loop = QEventLoop()
    self.cookie_jar = CookieJar()
    # The proxy object is created but only applied by the commented line below.
    self.proxy = QNetworkProxy(QNetworkProxy.HttpProxy, "127.0.1.1", 8888)
    self.network_manager = NetworkAccessManager()
    self.network_manager.setCookieJar(self.cookie_jar)
    # self.network_manager.setProxy(self.proxy)

    self.web_page = WebPage()
    self.web_page.setNetworkAccessManager(self.network_manager)
    self.web_view = QWebView()
    self.web_view.setPage(self.web_page)

    view_settings = self.web_view.settings
    view_settings().setAttribute(QWebSettings.AutoLoadImages, False)
    view_settings().setAttribute(QWebSettings.PluginsEnabled, True)
    view_settings().setAttribute(QWebSettings.JavascriptEnabled, True)
    # self.web_view.settings().setAttribute(QWebSettings.XSSAuditingEnabled, False)
    view_settings().setAttribute(QWebSettings.LocalContentCanAccessRemoteUrls, True)

    self.connect(self.web_view.page().networkAccessManager(),
                 SIGNAL("finished(QNetworkReply*)"),
                 self.network_reply_finished)

    # Page-load state, filled in by later calls.
    self.page_loaded_validator = None
    self.page_loaded_handler = None
    self.page_loaded_handler_kwargs = None
    self.timeout_message = None
    self.timer = None
    self.event_loop_exception = None
    logging.debug("<--")
def getlinks(self, url, html):
    """Parse *html* for *url* and return its filtered ``<a>`` links.

    The new ``WebPage`` is kept on ``self.webpage``; the filter patterns
    come from ``self.get_patterns_from_rules(url)``.
    """
    parsed_page = WebPage(url, html)
    self.webpage = parsed_page
    parsed_page.parse_links()
    rule_patterns = self.get_patterns_from_rules(url)
    #print ruptn
    return parsed_page.filter_links(tags=['a'], patterns=rule_patterns)
def __init__(self, parent, url=''):
    """Create the view with its own page, load *url* and apply web settings.

    The view's title is forwarded to ``parent.setWindowTitle`` and the main
    frame's window-object-cleared and icon-changed signals are connected to
    ``self.setJavaScriptObject`` and ``self.changeIcon``.
    """
    super(WebView, self).__init__(parent)

    # Drag bookkeeping.
    self.draging = False
    self.drag = QDrag(self)
    self.dragStartPos = None

    self.webPage = WebPage()
    self.setPage(self.webPage)
    self.mainFrame = self.page().mainFrame()

    self.setAttribute(Qt.WA_DeleteOnClose, True)
    self.titleChanged.connect(parent.setWindowTitle)
    self.load(url)

    webSettings = self.settings()
    webSettings.setDefaultTextEncoding("utf-8")
    # webSettings.setOfflineStorageDefaultQuota(sys.maxsize)
    # webSettings.setOfflineWebApplicationCacheQuota(sys.maxsize)
    webSettings.enablePersistentStorage(assets.fs.dataPath())

    def enable(flag):
        # Switch a single QWebSettings attribute on.
        webSettings.setAttribute(flag, True)

    enable(QWebSettings.PluginsEnabled)
    enable(QWebSettings.DnsPrefetchEnabled)
    enable(QWebSettings.XSSAuditingEnabled)
    enable(QWebSettings.CSSGridLayoutEnabled)
    enable(QWebSettings.ScrollAnimatorEnabled)
    enable(QWebSettings.DeveloperExtrasEnabled)
    enable(QWebSettings.JavascriptCanOpenWindows)
    enable(QWebSettings.JavascriptCanCloseWindows)
    enable(QWebSettings.JavascriptCanAccessClipboard)
    enable(QWebSettings.LocalContentCanAccessFileUrls)
    enable(QWebSettings.LocalContentCanAccessRemoteUrls)

    self.mainFrame.javaScriptWindowObjectCleared.connect(self.setJavaScriptObject)
    self.mainFrame.iconChanged.connect(self.changeIcon)
def start(self):
    """Crawl until the queue returns ``None``.

    Each downloaded page is stored, its ``<a>`` links filtered by the rules
    for its URL are queued as new seeds, and its ``<a>`` links matching
    ``self.file_rule`` are appended (as one list per page) to
    ``self.files``.
    """
    while True:
        url = self.queue.pop_url()
        # Single-argument print() behaves the same on Python 2 and 3.
        print(url)
        if url is None:
            print("crawling task is done.")
            break
        error_msg, url, redirected_url, html = self.downloader.download(url)
        # print error_msg, url, redirected_url, html
        if html is not None:
            self.webpagedb.html2db(url, html)

            self.webpage = WebPage(url, html)
            self.webpage.parse_links()
            ruptn = self.get_patterns_from_rules(url)
            #print ruptn
            links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
            print(links)
            self.add_seeds(links)

            # Second pass over the same page: links that look like files.
            file_pattern = [re.compile(self.file_rule)]
            files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
            self.files.append(files)
            #TODO:
            self.download_files(files)
            print(files)
def test_file_links(self):
    """The file-links fixture page has two anchors, two files and nothing else."""
    self.start_server(TestWebPage.FILE_LINKS_HTML)
    webpage = WebPage(TestWebPage.SERVER)

    # (getter, expected number of items), checked in this order.
    expectations = (
        (webpage.get_js, 0),
        (webpage.get_stylesheets, 0),
        (webpage.get_links, 0),
        (webpage.get_anchors, 2),
        (webpage.get_images, 0),
        (webpage.get_files, 2),
    )
    for getter, expected in expectations:
        self.assertEqual(expected, len(getter()))
def __init__(self):
    """Create a margin-less window whose only child is a web view."""
    super(Window, self).__init__()

    web_view = QWebView(self)
    web_view.setPage(WebPage())
    self.view = web_view

    vbox = QVBoxLayout(self)
    vbox.setMargin(0)
    vbox.addWidget(self.view)
def createWebPage(self):
    """Create, register and return a page configured like the main one.

    The page is appended to ``self.m_pages``, receives the default page
    settings and the shared network access manager, and its library path
    is set to the directory of the script file.
    """
    new_page = WebPage(self)
    self.m_pages.append(new_page)

    new_page.applySettings(self.m_defaultPageSettings)
    new_page.setNetworkAccessManager(self.m_netAccessMan)

    script_path = os.path.abspath(self.m_scriptFile)
    new_page.libraryPath = os.path.dirname(script_path)
    return new_page
def createTab(self, makeCurrent: bool = True) -> WebView:
    """Add an "(Untitled)" tab holding a new web view and return that view.

    The view's page is created on the default web-engine profile.  When
    *makeCurrent* is true the new tab is selected.
    """
    view = WebView()
    view.setPage(WebPage(QWebEngineProfile.defaultProfile(), view))
    self.setupView(view)
    self.addTab(view, self.tr("(Untitled)"))
    if not makeCurrent:
        return view
    self.setCurrentWidget(view)
    return view
def __init__(self, parent, args):
    """Create the main page and wire it to networking and JavaScript.

    Reads ``verbose``, ``script``, ``script_args``, ``proxy``,
    ``disk_cache``, ``cookies``, ``ignore_ssl_errors``, ``load_images``,
    ``load_plugins`` and ``local_access_remote`` from *args*.  Exits the
    process when ``:/bootstrap.js`` cannot be opened or is empty.
    """
    QObject.__init__(self, parent)

    # variable declarations
    self.m_defaultPageSettings = {}
    self.m_pages = []
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_returnValue = 0
    self.m_terminated = False
    # setup the values from args
    self.m_scriptFile = args.script
    self.m_args = args.script_args

    self.m_filesystem = FileSystem(self)

    self.m_pages.append(self.m_page)

    do_action('PhantomInitPre')

    if args.proxy is not None:
        app_proxy = QNetworkProxy(QNetworkProxy.HttpProxy,
                                  args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(app_proxy)
    else:
        QNetworkProxyFactory.setUseSystemConfiguration(True)

    # Provide WebPage with a non-standard Network Access Manager
    self.m_netAccessMan = NetworkAccessManager(self, args.disk_cache, args.cookies, args.ignore_ssl_errors)
    self.m_page.setNetworkAccessManager(self.m_netAccessMan)

    self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

    self.m_defaultPageSettings.update({
        'loadImages': args.load_images,
        'loadPlugins': args.load_plugins,
        'javascriptEnabled': True,
        'XSSAuditingEnabled': False,
        'userAgent': self.m_page.userAgent(),
        'localAccessRemote': args.local_access_remote,
    })
    self.m_page.applySettings(self.m_defaultPageSettings)

    self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

    # inject our properties and slots into javascript
    main_frame = self.m_page.mainFrame()
    main_frame.addToJavaScriptWindowObject('phantom', self)
    main_frame.addToJavaScriptWindowObject('fs', self.m_filesystem)

    resource = QFile(':/bootstrap.js')
    if not resource.open(QFile.ReadOnly):
        sys.exit('Can not bootstrap!')
    script_text = str(resource.readAll())
    resource.close()
    if not script_text:
        sys.exit('Can not bootstrap!')
    self.m_page.mainFrame().evaluateJavaScript(script_text)

    do_action('PhantomInitPost')
def __init__(self, args, parent = None):
    """Read the script from *args* and prepare a transparent, scrollbar-less page.

    Reads ``verbose``, ``script`` (an open file, closed here),
    ``script_args``, ``upload_file``, ``load_images``, ``load_plugins``,
    ``proxy`` and ``disk_cache`` from *args*.  A custom network access
    manager is installed only in verbose mode.
    """
    QObject.__init__(self, parent)

    # variable declarations
    self.m_loadStatus = self.m_state = ''
    # Each attribute gets its own dict: a chained assignment would bind all
    # three names to one object, so filling the script cache would also
    # change m_var and m_paperSize.
    self.m_var = {}
    self.m_paperSize = {}
    self.m_loadScript_cache = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_clipRect = QRect()
    # setup the values from args
    self.m_script = args.script.read()
    self.m_scriptFile = args.script.name
    self.m_scriptDir = os.path.dirname(args.script.name)
    if sys.platform.startswith('win'):
        self.m_scriptDir += '\\'
    else:
        self.m_scriptDir += '/'
    self.m_args = args.script_args
    self.m_upload_file = args.upload_file
    autoLoadImages = args.load_images != 'no'
    pluginsEnabled = args.load_plugins == 'yes'

    args.script.close()

    # Transparent base brush on the page palette.
    palette = self.m_page.palette()
    palette.setBrush(QPalette.Base, Qt.transparent)
    self.m_page.setPalette(palette)

    if not args.proxy:
        QNetworkProxyFactory.setUseSystemConfiguration(True)
    else:
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)

    self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
    self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
    self.m_page.settings().setAttribute(QWebSettings.FrameFlatteningEnabled, True)
    self.m_page.settings().setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
    self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled, True)
    self.m_page.settings().setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    self.m_page.settings().setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

    # Ensure we have a document.body.
    self.m_page.mainFrame().setHtml('<html><body></body></html>')

    self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
    self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

    if self.m_verbose:
        m_netAccessMan = NetworkAccessManager(args.disk_cache, self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

    # inject our properties and slots into javascript
    self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(self.inject)
    self.m_page.loadFinished.connect(self.finish)
def __init__(self, args, parent = None):
    """Read the script from *args* and prepare a transparent, scrollbar-less page.

    ``args.script[0]`` is the open script file (closed here) and
    ``args.script[1:]`` are the script arguments.  Also reads ``verbose``,
    ``upload_file``, ``load_images``, ``load_plugins`` and ``proxy``.
    Changes the working directory to the script's directory when the
    script path has one.  A custom network access manager is installed
    only in verbose mode.
    """
    QObject.__init__(self, parent)

    # variable declarations
    self.m_loadStatus = self.m_state = QString()
    # Each attribute gets its own dict: a chained assignment would bind all
    # three names to one object, so filling the script cache would also
    # change m_var and m_paperSize.
    self.m_var = {}
    self.m_paperSize = {}
    self.m_loadScript_cache = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_clipRect = QRect()
    # setup the values from args
    self.m_script = QString.fromUtf8(args.script[0].read())
    self.m_scriptFile = args.script[0].name
    self.m_args = args.script[1:]
    self.m_upload_file = args.upload_file
    autoLoadImages = args.load_images != 'no'
    pluginsEnabled = args.load_plugins == 'yes'

    args.script[0].close()

    # Transparent base brush on the page palette.
    palette = self.m_page.palette()
    palette.setBrush(QPalette.Base, Qt.transparent)
    self.m_page.setPalette(palette)

    if not args.proxy:
        QNetworkProxyFactory.setUseSystemConfiguration(True)
    else:
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)

    self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
    self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
    self.m_page.settings().setAttribute(QWebSettings.FrameFlatteningEnabled, True)
    self.m_page.settings().setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
    self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled, True)
    self.m_page.settings().setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    self.m_page.settings().setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

    # Ensure we have a document.body.
    self.m_page.mainFrame().setHtml('<html><body></body></html>')

    self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
    self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

    # if our script was called in a different directory, change to it
    # to make any dealings with files be relative to the scripts directory
    if os.path.dirname(self.m_scriptFile):
        os.chdir(os.path.dirname(self.m_scriptFile))

    if self.m_verbose:
        m_netAccessMan = NetworkAccessManager(self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

    # inject our properties and slots into javascript
    self.connect(self.m_page.mainFrame(), SIGNAL('javaScriptWindowObjectCleared()'), self.inject)
    self.connect(self.m_page, SIGNAL('loadFinished(bool)'), self.finish)
def run(self):
    """Worker loop: process queued URLs while ``self.status`` is truthy.

    Waits up to one second for a URL from ``self.spider.task_list``.  A URL
    rejected by ``check_robots`` is logged and skipped; otherwise the page
    is fetched, its data saved and every link not in the spider's
    ``visited_list`` added to ``extend_list``.  The spider's running
    counter is incremented for each URL taken and always decremented
    again.
    """
    while self.status:
        try:
            url = self.spider.task_list.get(timeout = 1)
        except Empty:
            # log.info('%s: task_list Empty' % self.name)
            continue

        self.spider.increase_running()
        try:
            if not self.spider.check_robots(url):
                log.info('%s - robots forbidden : %s' % (self.name, url))
                continue

            page = WebPage(url)
            # print('%s prepare to fetch %s' % (self.name, url))
            if page.fetch():
                self.spider.db.save_data(page.get_data())
                for link in page.get_link():  # retrive links from html
                    if link not in self.spider.visited_list:  # not visited yet
                        self.spider.extend_list.add(link)
            else:
                print('%s: Page fetch failed: %s' % (self.name, page.url))
        finally:
            # Balance increase_running() on every path, including the
            # robots-forbidden `continue` and any exception while fetching.
            self.spider.decrease_running()
def get_course_manager(self, semester, departments, show_html=0):
    """Fetch the course page and return a ``CourseManager`` built from it.

    The HTML is optionally shown through ``self.show_html`` and is always
    written to ``temp.html`` in the current directory, from which the
    ``WebPage`` is created.
    """
    page_html = self.get_course_page(semester, departments)
    if show_html:
        self.show_html(page_html)

    htmlpath = "temp.html"
    with open(htmlpath, "w") as out:
        out.write(page_html)

    return CourseManager(WebPage(htmlpath=htmlpath), logger=self.logger)
def run(self):
    """Worker loop: process queued URLs while ``self.status`` is truthy.

    Waits up to one second for a URL from ``self.spider.task_list``.  A URL
    rejected by ``check_robots`` is logged and skipped; otherwise the page
    is fetched, its data saved and every link not in the spider's
    ``visited_list`` added to ``extend_list``.  The spider's running
    counter is incremented for each URL taken and always decremented
    again.
    """
    while self.status:
        try:
            url = self.spider.task_list.get(timeout=1)
        except Empty:
            # log.info('%s: task_list Empty' % self.name)
            continue

        self.spider.increase_running()
        try:
            if not self.spider.check_robots(url):
                log.info('%s - robots forbidden : %s' % (self.name, url))
                continue

            page = WebPage(url)
            # print('%s prepare to fetch %s' % (self.name, url))
            if page.fetch():
                self.spider.db.save_data(page.get_data())
                for link in page.get_link():  # retrive links from html
                    if link not in self.spider.visited_list:  # not visited yet
                        self.spider.extend_list.add(link)
            else:
                print('%s: Page fetch failed: %s' % (self.name, page.url))
        finally:
            # Balance increase_running() on every path, including the
            # robots-forbidden `continue` and any exception while fetching.
            self.spider.decrease_running()
class Browser(object):
    """Bundle a web view with its page, network access manager and cookie jar."""

    def __init__(self, settings):
        """Assemble cookie jar, network manager, page and view for *settings*."""
        super(Browser, self).__init__()
        self._settings = settings

        jar = CookieJar()
        self._cookie_jar = jar
        manager = NetworkAccessManager(jar)
        self._network_access_manager = manager

        self._web_page = WebPage(settings)
        self._web_page.setNetworkAccessManager(manager)

        self._web_view = WebView(settings)
        self._web_view.setPage(self._web_page)

    def load_url(self, url, show_ui=True):
        """Load *url* in the view; show the view only if *show_ui* is ``True``."""
        target = QUrl(url)
        self._web_view.load(target)
        if show_ui is not True:
            return
        self._web_view.show()

    def load_html(self, html, url, cookies="", show_ui=True):
        """Render *html* with *url* as base URL.

        A non-empty *cookies* value is first loaded into the page's cookie
        jar.  The view is shown only if *show_ui* is ``True``.
        """
        has_cookies = len(cookies) > 0
        if has_cookies:
            self._web_page.cookie_jar.load_qt_cookie(cookies)

        self._web_view.setHtml(html, QUrl(url))
        if show_ui is not True:
            return
        self._web_view.show()
def add_webpage(self):
    """Interactively create a ``WebPage`` from console input and register it.

    Prompts for name, description, URL and three integer settings, then
    asks for confirmation until the answer is one of y/Y/n/N.  On y/Y the
    page content is retrieved (a ``ValueError`` is logged, not raised) and
    the page is appended to the private page list.
    """
    webpage = WebPage(name='', description='', url='', load_content=False)

    # Free-text fields.
    webpage.name = raw_input('Name: ')
    webpage.description = raw_input('Description: ')
    webpage.url = raw_input('URL: ')
    # Integer fields; int() raises ValueError on non-numeric input.
    webpage.update_timeout = int(raw_input('Update timeout: '))
    webpage.request_timeout = int(raw_input('Request timeout: '))
    webpage.data_offset = int(raw_input('Data offset: '))

    finished = False
    while not finished:
        answer = raw_input('Save? (y/n)')
        if answer in ('y', 'Y'):
            try:
                webpage.current = webpage.retrieve()
            except ValueError as e:
                logger.error('[!] Error: ' + str(e))
            finished = True
            self.__webpages.append(webpage)
            #self.start_updater()
        if answer in ('y', 'Y', 'n', 'N'):
            finished = True
def __init__(self, args, parent=None):
    """Create the main page and wire it to networking and JavaScript.

    Reads ``verbose``, ``script``, ``script_name``, ``script_args``,
    ``proxy``, ``disk_cache``, ``ignore_ssl_errors``, ``load_images`` and
    ``load_plugins`` from *args*.  Logs a critical message and exits with
    status 1 when ``:/bootstrap.js`` cannot be opened or is empty.
    """
    QObject.__init__(self, parent)

    # variable declarations
    self.m_defaultPageSettings = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_returnValue = 0
    self.m_terminated = False
    # setup the values from args
    self.m_script = args.script
    self.m_scriptFile = args.script_name
    self.m_args = args.script_args

    # The hooks receive this function's locals, so the local names below
    # (proxy, bootstrap, bootstrapper) are kept exactly as they are.
    do_action('PhantomInitPre', Bunch(locals()))

    if args.proxy:
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy,
                              args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)
    else:
        QNetworkProxyFactory.setUseSystemConfiguration(True)

    # Provide WebPage with a non-standard Network Access Manager
    self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
    self.m_page.setNetworkAccessManager(self.m_netAccessMan)

    self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

    self.m_defaultPageSettings.update({
        'loadImages': args.load_images,
        'loadPlugins': args.load_plugins,
        'userAgent': self.m_page.userAgent(),
    })
    self.m_page.applySettings(self.m_defaultPageSettings)

    # inject our properties and slots into javascript
    self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

    bootstrap = QFile(':/bootstrap.js')
    if not bootstrap.open(QFile.ReadOnly):
        qCritical('Can not bootstrap!')
        sys.exit(1)
    bootstrapper = str(bootstrap.readAll())
    bootstrap.close()
    if not bootstrapper:
        qCritical('Can not bootstrap!')
        sys.exit(1)
    self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

    do_action('PhantomInitPost', Bunch(locals()))
def __init__(self, parent, args):
    """Create the main page, apply the default settings and run the bootstrap.

    Reads ``verbose``, ``script``, ``script_args``, ``script_encoding``,
    ``output_encoding``, ``proxy``, ``load_images``, ``load_plugins`` and
    ``local_to_remote_url_access`` from *args*.
    """
    super(Phantom, self).__init__(parent)

    # variable declarations
    self.m_defaultPageSettings = {}
    self.m_pages = []
    self.m_verbose = args.verbose
    self.m_page = WebPage(self, args)
    self.m_returnValue = 0
    self.m_terminated = False
    # setup the values from args
    self.app_args = args
    self.m_scriptFile = args.script
    self.m_args = args.script_args
    self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
    self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys)

    self.m_pages.append(self.m_page)

    do_action('PhantomInitPre')

    if args.proxy is not None:
        app_proxy = QNetworkProxy(QNetworkProxy.HttpProxy,
                                  args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(app_proxy)
    else:
        QNetworkProxyFactory.setUseSystemConfiguration(True)

    self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

    self.m_defaultPageSettings.update({
        'loadImages': args.load_images,
        'loadPlugins': args.load_plugins,
        'javascriptEnabled': True,
        'XSSAuditingEnabled': False,
        'userAgent': self.m_page.userAgent(),
        'localToRemoteUrlAccessEnabled': args.local_to_remote_url_access,
    })
    self.m_page.applySettings(self.m_defaultPageSettings)

    script_path = os.path.abspath(self.m_scriptFile)
    self.libraryPath = os.path.dirname(script_path)

    # inject our properties and slots into javascript
    self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

    with QPyFile(':/bootstrap.js') as bootstrap_file:
        bootstrap_source = str(bootstrap_file.readAll())
    self.m_page.mainFrame().evaluateJavaScript(bootstrap_source)

    do_action('PhantomInitPost')
def process_web_page(thread_id, sites, q):
    """Collect the allowed external links of every site and put them on *q*.

    Links whose ``qdn`` matches ``FORBIDDEN_DOMAINS`` or whose ``url``
    matches ``FORBIDDEN_FILETYPES`` are dropped.  A site that raises while
    being processed is skipped (best effort); links gathered from it before
    the failure are kept.  Exactly one list is put on *q*, even when
    *sites* is empty.  *thread_id* is not used here.
    """
    webpages = []
    for site in sites:
        try:
            extractor = WebPageLinkExtractor(WebPage(site))
            # Discard invalid webpages
            for web in extractor.external_links:
                if FORBIDDEN_DOMAINS.match(web.qdn) or FORBIDDEN_FILETYPES.match(web.url):
                    continue
                webpages.append(web)
            time.sleep(0.1)
        except Exception:
            # Deliberately best-effort per site.  Catching Exception rather
            # than using a bare except lets KeyboardInterrupt and
            # SystemExit still stop the worker.
            continue
    q.put(webpages)
def start(self):
    """Crawl until the queue returns ``None``.

    Each downloaded page is stored, parsed for ``<a>`` links filtered by
    the rules for its URL, and those links are queued as new seeds.  The
    loop sleeps three seconds after every URL.
    """
    while True:
        url = self.queue.pop_url()
        # Single-argument print() behaves the same on Python 2 and 3.
        print(url)
        if url is None:
            print("crawling task is done.")
            break
        error_msg, url, redirected_url, html = self.downloader.download(url)
        #print error_msg, url, redirected_url, html
        if html is not None:
            self.webpagedb.html2db(url, html)

            self.webpage = WebPage(url, html)
            self.webpage.parse_links()
            ruptn = self.get_patterns_from_rules(url)
            print(ruptn)
            links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
            self.add_seeds(links)
        self.mysleep(3)
def start(self):
    """Crawl until the queue returns ``None``.

    Each downloaded page is stored, parsed for ``<a>`` links filtered by
    the rules for its URL, and a non-empty result is queued as new seeds.
    The loop sleeps three seconds after every URL.
    """
    while True:
        url = self.queue.popUrl()
        # Single-argument print() behaves the same on Python 2 and 3.
        print(url)
        if url is None:
            print("crawling task is done.")
            break
        error_msg, url, redirected_url, html = self.downloader.download(url)
        #print error_msg, url, redirected_url, html
        if html is not None:
            self.webpagedb.storeHtmlToDb(url, html)  # store the page

            self.webpage = WebPage(url, html)  # start parsing the page
            self.webpage.parseLinks()  # collect all hyperlinks
            ruptn = self.get_patterns_from_rules(url)
            print(ruptn)
            # filter_links may yield None here, hence the truthiness check
            links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
            if links:
                self.addSeeds(links)
        self.mysleep(3)  # rest a little before crawling on
def get_all_urls(self):
    """Get all building urls from all index pages.

    Every anchor carrying ``se:clickable:target="true"`` contributes
    ``"https://streeteasy.com" + href``; anchors without an ``href`` are
    skipped.  ``IndexPage.url_count`` is incremented once per collected URL.

    Returns:
        dict: {"page 1": [url1, url2, url3 ...], "page 2": [...], ...}
    """
    all_url_list = {}
    page_count = 0  # stays 0 when there are no index pages
    for page_count, page_url in enumerate(self.get_all_page_urls(), start=1):
        single_page_urls = []
        building_url_list = WebPage(page_url).get_soup().findAll(
            'a', {"se:clickable:target": "true"})
        for building in building_url_list:
            href = building.get("href")
            if href is None:
                # No target to join; concatenating None would raise TypeError.
                continue
            single_page_urls.append("https://streeteasy.com" + href)
            IndexPage.url_count += 1
        all_url_list[f"page {page_count}"] = single_page_urls
        print(f"Retrived all url on page {page_count}")
    print("=============================")
    print(f"RETRIVED ALL URLS FOR {page_count} PAGES")
    print("TOTAL URLS IDENTIFIED: ", IndexPage.url_count)
    return all_url_list
class Phantom(QObject):
    """The ``phantom`` object exposed to the executed JavaScript.

    Owns the main ``WebPage``, the shared ``NetworkAccessManager`` and the
    default page settings, and publishes a handful of properties and slots
    to the script.
    """

    def __init__(self, args, parent=None):
        """Build the main page, apply settings from *args* and run bootstrap.js.

        Reads ``verbose``, ``script``, ``script_args``, ``proxy``,
        ``disk_cache``, ``ignore_ssl_errors``, ``load_images`` and
        ``load_plugins`` from *args*.  Exits the process when the bootstrap
        resource cannot be opened or is empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_scriptFile = args.script
        self.m_args = args.script_args

        # The hook receives every local defined so far (self, args, parent).
        do_action('PhantomInitPre', Bunch(locals()))

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as (host, port).
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_page.applySettings(self.m_defaultPageSettings)

        # Goes through the libraryPath property setter, i.e. onto m_page.
        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            sys.exit('Can not bootstrap!')
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        if not bootstrapper:
            sys.exit('Can not bootstrap!')
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        # By now the hook also sees proxy (if set), bootstrap and bootstrapper.
        do_action('PhantomInitPost', Bunch(locals()))

    def execute(self):
        """Inject the script file into the main frame.

        Returns:
            False when ``exit`` was called while the script ran, else True.
        """
        injectJsInFrame(self.m_scriptFile, os.path.dirname(os.path.abspath(__file__)), self.m_page.mainFrame(), True)
        return not self.m_terminated

    def printConsoleMessage(self, message, lineNumber, source):
        """Print a JavaScript console message, prefixed with source:line if known."""
        if source:
            message = '%s:%d %s' % (source, lineNumber, message)
        print message

    def returnValue(self):
        """Return the exit code recorded by ``exit`` (0 until then)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Arguments passed to the script."""
        return self.m_args

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create another ``WebPage`` sharing the defaults and network manager."""
        page = WebPage(self)
        page.applySettings(self.m_defaultPageSettings)
        page.setNetworkAccessManager(self.m_netAccessMan)
        page.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """Settings applied to every page created through this object."""
        return self.m_defaultPageSettings

    # Two overloads: exit() and exit(code).
    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record *code*, destroy the main page and leave the Qt event loop."""
        self.m_terminated = True
        self.m_returnValue = code

        # stop javascript execution; delete C++ object first,
        # then delete the Python reference
        sip.delete(self.m_page)
        del self.m_page

        QApplication.instance().exit(code)

    @pyqtSlot(str, result=bool)
    def injectJs(self, filePath):
        """Inject *filePath* (resolved against ``libraryPath``) into the main frame."""
        return injectJsInFrame(filePath, self.libraryPath, self.m_page.mainFrame())

    @pyqtProperty(str)
    def libraryPath(self):
        """Library path, stored on the main page."""
        return self.m_page.libraryPath

    @libraryPath.setter
    def libraryPath(self, dirPath):
        self.m_page.libraryPath = dirPath

    @pyqtProperty(str)
    def scriptName(self):
        """Base name of the script file."""
        return os.path.basename(self.m_scriptFile)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version as a map with 'major', 'minor' and 'patch' keys."""
        version = {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }
        return version

    # NOTE(review): placed in the class body here, so locals() would be the
    # class namespace; the collapsed source does not show the indentation —
    # confirm.
    do_action('Phantom', Bunch(locals()))
class Phantom(QObject):
    """The ``phantom`` object exposed to the executed JavaScript.

    Owns the single ``WebPage`` the script drives and offers rendering,
    script loading, a small key/value context and page geometry properties.
    """

    def __init__(self, args, parent=None):
        """Create the page, configure it from *args* and wire up the signals.

        Reads ``verbose``, ``script`` (an open file object, closed here),
        ``script_args``, ``upload_file``, ``load_images``, ``load_plugins``,
        ``proxy``, ``disk_cache`` and ``ignore_ssl_errors`` from *args*.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_loadStatus = self.m_state = ''
        # Three independent dicts.  They used to be one chained assignment,
        # so context variables, the paper size and the script cache all
        # shared a single dict object and leaked keys into each other.
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # returnValue() may be asked before exit() ever ran.
        self.m_returnValue = 0
        # setup the values from args
        self.m_script = args.script.read()
        self.m_scriptFile = args.script.name
        self.m_scriptDir = os.path.dirname(args.script.name) + '/'
        self.m_args = args.script_args
        self.m_upload_file = args.upload_file
        autoLoadImages = False if args.load_images == 'no' else True
        pluginsEnabled = True if args.load_plugins == 'yes' else False

        args.script.close()

        # The hook receives the locals defined so far; local names in this
        # method are therefore part of the plugin-visible surface.
        do_action('PhantomInitPre', Bunch(locals()))

        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as (host, port).
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
        self.m_page.settings().setAttribute(QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled, True)
        self.m_page.settings().setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

        m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(self.inject)
        self.m_page.loadFinished.connect(self.finish)

        do_action('PhantomInitPost', Bunch(locals()))

    def execute(self):
        """Evaluate the script in the main frame.

        A leading shebang line is commented out and ``.coffee`` files are
        converted through ``CSConverter`` first.
        """
        if self.m_script.startswith('#!'):
            self.m_script = '//' + self.m_script

        if self.m_scriptFile.lower().endswith('.coffee'):
            coffee = CSConverter(self)
            self.m_script = coffee.convert(self.m_script)

        self.m_page.mainFrame().evaluateJavaScript(self.m_script)

    def finish(self, success):
        """Record the load status and evaluate the script again."""
        self.m_loadStatus = 'success' if success else 'fail'
        self.m_page.mainFrame().evaluateJavaScript(self.m_script)

    def inject(self):
        """Expose this object to the page as ``phantom``."""
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

    def renderPdf(self, fileName):
        """Print the main frame to *fileName* as PDF.

        Returns:
            True after printing; False when the paper size holds neither
            width/height nor a format.
        """
        p = QPrinter()
        p.setOutputFormat(QPrinter.PdfFormat)
        p.setOutputFileName(fileName)
        p.setResolution(pdf_dpi)
        # Work on a copy so a size derived from the current page content is
        # not stored as the configured paper size for later renders.
        paperSize = dict(self.m_paperSize)

        if not len(paperSize):
            pageSize = QSize(self.m_page.mainFrame().contentsSize())
            paperSize['width'] = str(pageSize.width()) + 'px'
            paperSize['height'] = str(pageSize.height()) + 'px'
            paperSize['border'] = '0px'

        if paperSize.get('width') and paperSize.get('height'):
            sizePt = QSizeF(ceil(self.stringToPointSize(paperSize['width'])),
                            ceil(self.stringToPointSize(paperSize['height'])))
            p.setPaperSize(sizePt, QPrinter.Point)
        elif 'format' in paperSize:
            orientation = QPrinter.Landscape if paperSize.get('orientation') and paperSize['orientation'].lower() == 'landscape' else QPrinter.Portrait
            orientation = QPrinter.Orientation(orientation)
            p.setOrientation(orientation)

            formats = {
                'A3': QPrinter.A3,
                'A4': QPrinter.A4,
                'A5': QPrinter.A5,
                'Legal': QPrinter.Legal,
                'Letter': QPrinter.Letter,
                'Tabloid': QPrinter.Tabloid
            }

            p.setPaperSize(QPrinter.A4)  # fallback
            wanted = paperSize['format'].lower()
            for formatName, size in formats.items():
                if formatName.lower() == wanted:
                    p.setPaperSize(size)
                    break
        else:
            return False

        border = floor(self.stringToPointSize(paperSize['border'])) if paperSize.get('border') else 0
        p.setPageMargins(border, border, border, border, QPrinter.Point)

        self.m_page.mainFrame().print_(p)
        return True

    def returnValue(self):
        """Return the exit code recorded by ``exit`` (0 until then)."""
        return self.m_returnValue

    def stringToPointSize(self, string):
        """Convert a size string with an optional unit suffix to a float.

        The first matching suffix of mm, cm, in, px or '' selects the
        factor; the empty suffix matches every string, so the trailing
        ``return 0`` is only a safety net.
        """
        units = (
            ('mm', 72 / 25.4),
            ('cm', 72 / 2.54),
            ('in', 72.0),
            ('px', 72.0 / pdf_dpi / 2.54),
            ('', 72.0 / pdf_dpi / 2.54)
        )

        for unit, factor in units:
            if string.endswith(unit):
                value = string.rstrip(unit)
                return float(value) * factor
        return 0

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Arguments passed to the script."""
        return self.m_args

    @pyqtProperty('QVariantMap')
    def clipRect(self):
        """Clip rectangle as a map with width/height/top/left."""
        result = {
            'width': self.m_clipRect.width(),
            'height': self.m_clipRect.height(),
            'top': self.m_clipRect.top(),
            'left': self.m_clipRect.left()
        }
        return result

    @clipRect.setter
    def clipRect(self, size):
        # Values are collected in a local dict; the previous implementation
        # wrote them into the module's globals().
        values = {}
        for item in ('width', 'height', 'top', 'left'):
            try:
                value = int(size[item])
            except KeyError:
                # Key not supplied: keep the current value.
                value = getattr(self.m_clipRect, item)()
            else:
                # Only the extent is clamped; top/left may be negative.
                if value < 0 and item not in ('top', 'left'):
                    value = 0
            values[item] = value

        self.m_clipRect = QRect(values['left'], values['top'],
                                values['width'], values['height'])

    @pyqtProperty(str)
    def content(self):
        """HTML of the main frame."""
        return self.m_page.mainFrame().toHtml()

    @content.setter
    def content(self, content):
        self.m_page.mainFrame().setHtml(content)

    # Two overloads: exit() and exit(code).
    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record *code*, stop reacting to page loads and queue application quit."""
        self.m_returnValue = code
        self.m_page.loadFinished.disconnect(self.finish)
        QTimer.singleShot(0, qApp, SLOT('quit()'))

    @pyqtProperty(str)
    def loadStatus(self):
        """'' initially, then 'loading', 'success' or 'fail'."""
        return self.m_loadStatus

    @pyqtSlot(str, result=bool)
    def loadScript(self, script):
        """Evaluate a script file from the script directory, caching its source.

        Returns:
            True when the script was evaluated, False when it could not be
            read.
        """
        if script in self.m_loadScript_cache:
            self.m_page.mainFrame().evaluateJavaScript(self.m_loadScript_cache[script])
            return True

        scriptFile = script
        try:
            # ``with`` closes the handle; it used to stay open.
            with codecs.open(self.m_scriptDir + script, encoding='utf-8') as handle:
                script = handle.read()
        except IOError:
            return False

        if script.startswith('#!'):
            script = '//' + script

        if scriptFile.lower().endswith('.coffee'):
            coffee = CSConverter(self)
            script = coffee.convert(script)

        self.m_loadScript_cache[scriptFile] = script
        self.m_page.mainFrame().evaluateJavaScript(script)
        return True

    @pyqtSlot(str, name='open')
    def open_(self, address):
        """Stop the current load and navigate the main frame to *address*."""
        qDebug('Opening address %s' % address)
        self.m_page.triggerAction(QWebPage.Stop)
        self.m_loadStatus = 'loading'
        self.m_page.mainFrame().setUrl(QUrl(address))

    @pyqtProperty('QVariantMap')
    def paperSize(self):
        """Paper size map used by ``renderPdf``."""
        return self.m_paperSize

    @paperSize.setter
    def paperSize(self, size):
        self.m_paperSize = size

    @pyqtSlot(str, result=bool)
    def render(self, fileName):
        """Render the page to *fileName*; ``.pdf`` names go through ``renderPdf``.

        Returns:
            The result of ``renderPdf`` or of ``QImage.save``.
        """
        fileInfo = QFileInfo(fileName)
        path = QDir()
        path.mkpath(fileInfo.absolutePath())

        if fileName.lower().endswith('.pdf'):
            return self.renderPdf(fileName)

        viewportSize = QSize(self.m_page.viewportSize())
        pageSize = QSize(self.m_page.mainFrame().contentsSize())

        if not self.m_clipRect.isEmpty():
            bufferSize = self.m_clipRect.size()
        else:
            bufferSize = self.m_page.mainFrame().contentsSize()

        # NOTE(review): a QSize compared with '' presumably never matches, so
        # this guard looks dead; an emptiness check may have been intended —
        # confirm before changing.
        if pageSize == '':
            return False

        image = QImage(bufferSize, QImage.Format_ARGB32)
        image.fill(qRgba(255, 255, 255, 0))
        p = QPainter(image)
        p.setRenderHint(QPainter.Antialiasing, True)
        p.setRenderHint(QPainter.TextAntialiasing, True)
        p.setRenderHint(QPainter.SmoothPixmapTransform, True)
        # Temporarily grow the viewport to the whole page for rendering.
        self.m_page.setViewportSize(pageSize)

        if not self.m_clipRect.isEmpty():
            p.translate(-self.m_clipRect.left(), -self.m_clipRect.top())
            self.m_page.mainFrame().render(p, QRegion(self.m_clipRect))
        else:
            self.m_page.mainFrame().render(p)

        p.end()
        self.m_page.setViewportSize(viewportSize)
        return image.save(fileName)

    @pyqtSlot('QWebElement', str)
    def setFormInputFile(self, el, fileTag):
        """Store *fileTag* on the page, then dispatch a synthetic click on *el*."""
        self.m_page.m_nextFileTag = fileTag
        el.evaluateJavaScript('''(function(target){ var evt = document.createEvent('MouseEvents'); evt.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null); target.dispatchEvent(evt);})(this);''')

    @pyqtSlot(int)
    def sleep(self, ms):
        """Keep processing Qt events until more than *ms* milliseconds passed."""
        startTime = QTime.currentTime()

        while True:
            QApplication.processEvents(QEventLoop.AllEvents, 25)
            if startTime.msecsTo(QTime.currentTime()) > ms:
                break
            usleep(0.005)

    @pyqtProperty(str)
    def state(self):
        """Free-form state string."""
        return self.m_state

    @state.setter
    def state(self, value):
        self.m_state = value

    @pyqtProperty(str)
    def userAgent(self):
        """User agent string stored on the page."""
        return self.m_page.m_userAgent

    @userAgent.setter
    def userAgent(self, ua):
        self.m_page.m_userAgent = ua

    @pyqtSlot(str, result='QVariant')
    @pyqtSlot(int, result='QVariant')
    @pyqtSlot(str, 'QVariant')
    @pyqtSlot(int, 'QVariant')
    def ctx(self, name, value=None):
        """Get (no/falsy *value*) or set a context variable."""
        # NOTE(review): falsy values (0, '', False) are treated as a read and
        # therefore cannot be stored; kept as is for compatibility.
        if not value:
            return self.m_var.get(name)

        self.m_var[name] = value

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version as a map with 'major', 'minor' and 'patch' keys."""
        version = {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }
        return version

    @pyqtProperty('QVariantMap')
    def viewportSize(self):
        """Viewport size as a map with width/height."""
        size = self.m_page.viewportSize()
        result = {
            'width': size.width(),
            'height': size.height()
        }
        return result

    @viewportSize.setter
    def viewportSize(self, size):
        # Local dict instead of module globals(); negative sizes become 0 and
        # a missing key keeps the current value.
        values = {}
        for item in ('width', 'height'):
            try:
                values[item] = max(int(size[item]), 0)
            except KeyError:
                values[item] = getattr(self.m_page.viewportSize(), item)()

        self.m_page.setViewportSize(QSize(values['width'], values['height']))

    # NOTE(review): placed in the class body here, so locals() would be the
    # class namespace; the collapsed source does not show the indentation —
    # confirm.
    do_action('Phantom', Bunch(locals()))
class Phantom(QObject):
    """The ``phantom`` object exposed to the executed JavaScript.

    Tracks every ``WebPage`` it created in ``m_pages`` so all of them can be
    destroyed on exit, and additionally exposes a ``FileSystem`` as ``fs``.
    """

    def __init__(self, args, parent=None):
        """Build the main page, apply settings from *args* and run bootstrap.js.

        Reads ``verbose``, ``script``, ``script_args``, ``proxy``,
        ``disk_cache``, ``ignore_ssl_errors``, ``load_images``,
        ``load_plugins`` and ``local_access_remote`` from *args*.  Exits the
        process when the bootstrap resource cannot be opened or is empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_scriptFile = args.script
        self.m_args = args.script_args

        self.m_filesystem = FileSystem(self)

        # The main page is tracked like any page from createWebPage().
        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as (host, port).
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        self.m_page.javaScriptConsoleMessageSent.connect(
            self.printConsoleMessage)

        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings[
            'localAccessRemote'] = args.local_access_remote
        self.m_page.applySettings(self.m_defaultPageSettings)

        # Goes through the libraryPath property setter, i.e. onto m_page.
        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)
        self.m_page.mainFrame().addToJavaScriptWindowObject(
            'fs', self.m_filesystem)

        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            sys.exit('Can not bootstrap!')
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        if not bootstrapper:
            sys.exit('Can not bootstrap!')
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        do_action('PhantomInitPost')

    def execute(self):
        """Inject the script file into the main frame.

        Returns:
            False when ``exit`` was called while the script ran, else True.
        """
        injectJsInFrame(self.m_scriptFile, os.path.dirname(os.path.abspath(__file__)), self.m_page.mainFrame(), True)
        return not self.m_terminated

    def printConsoleMessage(self, message, lineNumber, source):
        """Print a JavaScript console message, prefixed with source:line if known."""
        if source:
            message = '%s:%d %s' % (source, lineNumber, message)
        print message

    def returnValue(self):
        """Return the exit code recorded by ``exit`` (0 until then)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtSlot(WebPage)
    def _destroy(self, page):
        """Forget *page* and delete its C++ object."""
        self.m_pages.remove(page)
        sip.delete(page)

    @pyqtProperty('QStringList')
    def args(self):
        """Arguments passed to the script."""
        return self.m_args

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create and track another ``WebPage`` sharing the defaults and network manager."""
        page = WebPage(self)
        self.m_pages.append(page)
        page.applySettings(self.m_defaultPageSettings)
        page.setNetworkAccessManager(self.m_netAccessMan)
        page.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """Settings applied to every page created through this object."""
        return self.m_defaultPageSettings

    # Two overloads: exit() and exit(code).
    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record *code*, destroy every tracked page and leave the Qt event loop."""
        self.m_terminated = True
        self.m_returnValue = code

        # stop javascript execution in start script;
        # delete all the pages C++ objects, then clear
        # the page list, and empty the Phantom page
        for page in self.m_pages:
            sip.delete(page)
        del self.m_pages[:]
        self.m_page = None

        QApplication.instance().exit(code)

    @pyqtSlot(str, result=bool)
    def injectJs(self, filePath):
        """Inject *filePath* (resolved against ``libraryPath``) into the main frame."""
        return injectJsInFrame(filePath, self.libraryPath, self.m_page.mainFrame())

    @pyqtProperty(str)
    def libraryPath(self):
        """Library path, stored on the main page."""
        return self.m_page.libraryPath

    @libraryPath.setter
    def libraryPath(self, dirPath):
        self.m_page.libraryPath = dirPath

    @pyqtProperty(str)
    def scriptName(self):
        """Base name of the script file."""
        return os.path.basename(self.m_scriptFile)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version as a map with 'major', 'minor' and 'patch' keys."""
        version = {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }
        return version

    # NOTE(review): placed in the class body here; the collapsed source does
    # not show the indentation — confirm.
    do_action('Phantom')
class Phantom(QObject):
    """The ``phantom`` object exposed to the executed JavaScript.

    Tracks every ``WebPage`` it created in ``m_pages`` and carries the
    script and output encodings.
    """

    def __init__(self, parent, args):
        """Build the main page, apply settings from *args* and run bootstrap.js.

        Reads ``verbose``, ``script``, ``script_args``, ``script_encoding``,
        ``output_encoding``, ``proxy``, ``load_images``, ``load_plugins`` and
        ``local_to_remote_url_access`` from *args*.
        """
        super(Phantom, self).__init__(parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self, args)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.app_args = args
        self.m_scriptFile = args.script
        self.m_args = args.script_args
        self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
        self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys)

        # The main page is tracked like any page from createWebPage().
        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        if args.proxy is None:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as (host, port).
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['javascriptEnabled'] = True
        self.m_defaultPageSettings['XSSAuditingEnabled'] = False
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings['localToRemoteUrlAccessEnabled'] = args.local_to_remote_url_access
        self.m_page.applySettings(self.m_defaultPageSettings)

        # Goes through the libraryPath property setter, i.e. onto m_page.
        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        with QPyFile(':/bootstrap.js') as f:
            bootstrap = str(f.readAll())
        self.m_page.mainFrame().evaluateJavaScript(bootstrap)

        do_action('PhantomInitPost')

    def execute(self):
        """Inject the script file into the main frame.

        Returns:
            False when ``exit`` was called while the script ran, else True.
        """
        injectJsInFrame(self.m_scriptFile, self.m_scriptEncoding.encoding,
                        os.path.dirname(os.path.abspath(__file__)),
                        self.m_page.mainFrame(), True)
        return not self.m_terminated

    def printConsoleMessage(self, message, lineNumber, source):
        """Print a JavaScript console message, prefixed with source:line if known."""
        if source:
            message = '%s:%d %s' % (source, lineNumber, message)
        print(message)

    def returnValue(self):
        """Return the exit code recorded by ``exit`` (0 until then)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Arguments passed to the script."""
        return self.m_args

    @pyqtSlot(result=FileSystem)
    def createFilesystem(self):
        """Create a ``FileSystem`` object parented to this object."""
        return FileSystem(self)

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create and track another ``WebPage`` with the default settings."""
        page = WebPage(self, self.app_args)
        self.m_pages.append(page)
        page.applySettings(self.m_defaultPageSettings)
        page.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """Settings applied to every page created through this object."""
        return self.m_defaultPageSettings

    # Two overloads: exit() and exit(code).
    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record *code*, destroy every tracked page and leave the Qt event loop."""
        self.m_terminated = True
        self.m_returnValue = code

        # stop javascript execution in start script;
        # delete all the pages C++ objects, then clear
        # the page list, and empty the Phantom page
        for page in self.m_pages:
            sip.delete(page)
        del self.m_pages[:]
        self.m_page = None

        QApplication.instance().exit(code)

    @pyqtSlot(str, result=bool)
    def injectJs(self, filePath):
        """Inject *filePath* (resolved against ``libraryPath``) into the main frame."""
        return injectJsInFrame(filePath, self.m_scriptEncoding.encoding,
                               self.libraryPath, self.m_page.mainFrame())

    @pyqtSlot(str, result=str)
    def loadModuleSource(self, name):
        """Return the source of the bundled module ``:/modules/<name>.js``."""
        moduleSourceFilePath = ':/modules/%s.js' % name

        with QPyFile(moduleSourceFilePath) as f:
            moduleSource = str(f.readAll())

        return moduleSource

    @pyqtProperty(str)
    def libraryPath(self):
        """Library path, stored on the main page."""
        return self.m_page.libraryPath

    @libraryPath.setter
    def libraryPath(self, dirPath):
        self.m_page.libraryPath = dirPath

    @pyqtProperty(str)
    def outputEncoding(self):
        """Name of the encoding used for output."""
        return self.m_outputEncoding.name

    @outputEncoding.setter
    def outputEncoding(self, encoding):
        self.m_outputEncoding = Encode(encoding, self.m_outputEncoding.encoding)

        sys.stdout.encoding = self.m_outputEncoding.encoding
        sys.stdout.encode_to = self.m_outputEncoding.encoding
        sys.stderr.encoding = self.m_outputEncoding.encoding
        # Was a second assignment to sys.stdout.encode_to (copy-paste slip),
        # which left stderr encoding with the old codec.
        # NOTE(review): assumes stderr is wrapped like stdout and honours
        # ``encode_to`` — confirm.
        sys.stderr.encode_to = self.m_outputEncoding.encoding

    @pyqtProperty(str)
    def scriptName(self):
        """Base name of the script file."""
        return os.path.basename(self.m_scriptFile)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version as a map with 'major', 'minor' and 'patch' keys."""
        version = {
            'major': __version_info__[0],
            'minor': __version_info__[1],
            'patch': __version_info__[2]
        }
        return version

    # NOTE(review): placed in the class body here; the collapsed source does
    # not show the indentation — confirm.
    do_action('Phantom')
def __init__(self, args, parent=None):
    """Create the page, configure it from *args* and wire up the signals.

    Reads ``verbose``, ``script`` (an open file object, closed here),
    ``script_args``, ``upload_file``, ``load_images``, ``load_plugins``,
    ``proxy``, ``disk_cache`` and ``ignore_ssl_errors`` from *args*.
    """
    QObject.__init__(self, parent)

    # variable declarations
    self.m_loadStatus = self.m_state = ''
    # Three independent dicts.  They used to be one chained assignment, so
    # all three attributes referred to the very same dict object and a key
    # written through one showed up in the other two.
    self.m_var = {}
    self.m_paperSize = {}
    self.m_loadScript_cache = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_clipRect = QRect()
    # setup the values from args
    self.m_script = args.script.read()
    self.m_scriptFile = args.script.name
    self.m_scriptDir = os.path.dirname(args.script.name) + '/'
    self.m_args = args.script_args
    self.m_upload_file = args.upload_file
    autoLoadImages = False if args.load_images == 'no' else True
    pluginsEnabled = True if args.load_plugins == 'yes' else False

    args.script.close()

    # The hook receives the locals defined so far; local names in this
    # method are therefore part of the plugin-visible surface.
    do_action('PhantomInitPre', Bunch(locals()))

    palette = self.m_page.palette()
    palette.setBrush(QPalette.Base, Qt.transparent)
    self.m_page.setPalette(palette)

    if not args.proxy:
        QNetworkProxyFactory.setUseSystemConfiguration(True)
    else:
        # args.proxy is indexed as (host, port).
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                              int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)

    self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages,
                                        autoLoadImages)
    self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled,
                                        pluginsEnabled)
    self.m_page.settings().setAttribute(
        QWebSettings.FrameFlatteningEnabled, True)
    self.m_page.settings().setAttribute(
        QWebSettings.OfflineStorageDatabaseEnabled, True)
    self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled,
                                        True)
    self.m_page.settings().setLocalStoragePath(
        QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    self.m_page.settings().setOfflineStoragePath(
        QDesktopServices.storageLocation(QDesktopServices.DataLocation))

    # Ensure we have a document.body.
    self.m_page.mainFrame().setHtml('<html><body></body></html>')

    self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal,
                                               Qt.ScrollBarAlwaysOff)
    self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical,
                                               Qt.ScrollBarAlwaysOff)

    m_netAccessMan = NetworkAccessManager(args.disk_cache,
                                          args.ignore_ssl_errors, self)
    self.m_page.setNetworkAccessManager(m_netAccessMan)

    # inject our properties and slots into javascript
    self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(
        self.inject)
    self.m_page.loadFinished.connect(self.finish)

    do_action('PhantomInitPost', Bunch(locals()))
error_msg = "lxml error" return error_msg, url, redirected_url, html """ if __name__ == "__main__": #''' url = "http://www.cnbeta.com/" downloader = DownloadManager() error_msg, url, redirected_url, html = downloader.download(url) print "error_msg=%s" %error_msg print "url=%s" %url print "redirected_url=%s" %redirected_url f = open("www.cnbeta.com.html",'w') f.write(html) f.close() webpage = WebPage(url, html) webpage.parse_links() website = 'cnbeta\.com' patnstr = '^(http|https)://(.*\.' + website + ')(.+)$'; links = webpage.filter_links(tags=['a'], str_patterns=[patnstr]) links.sort() f_filter_links = open('filter_links_cnbeta.txt', 'w') #print links f = open('links_regged_cnbeta.txt','w') for link in links: f_filter_links.write('%s\n' % link) f.write('%s\n' % link) for elem, attr, lnk, pos in webpage.doc.iterlinks():
class Crawler():
    """Rule-driven crawler backed by on-disk queue / page / duplicate stores."""

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        # {compiled url pattern: [compiled link patterns]}
        self.rules = {}
        # One entry (the filter_links result) per crawled page.
        self.files = []
        self.file_rule = ".+"

    def init_database(self):
        """Open the queue, page, duplicate-check and repo-state stores."""
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
        self.repodb = RepoStateDB()

    def add_seeds(self, links):
        """Queue every link of *links* that was not seen before."""
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        """Replace the rule set.

        Args:
            rules: Mapping of a URL regex string to the list of link regex
                strings that may be followed from pages matching it.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def set_file_rule(self, rule):
        """Set the regex string that selects file links."""
        self.file_rule = rule

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of all rules matching *url*."""
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def download_files(self, files):
        """Fetch every URL of *files* with wget into ``config.repos_dir``.

        The outcome of each download is recorded through
        ``self.repodb.update(url, succeeded)``.
        """
        import subprocess

        for f in files:
            # Argument list, no shell: the URLs come from crawled pages and
            # must not be interpreted as shell syntax.  ``--`` keeps a URL
            # starting with '-' from being parsed as an option.
            cmd = ["wget", "-c", "-P", config.repos_dir, "--", f]
            try:
                ret_code = subprocess.call(cmd)
            except OSError:
                # wget could not be started: record as a failed download.
                ret_code = -1
            self.repodb.update(f, ret_code == 0)

    def start(self):
        """Crawl until the URL queue is exhausted.

        Every downloaded page is stored, its rule-matching links are queued
        as seeds and its file links are collected in ``self.files``.
        """
        while True:
            url = self.queue.pop_url()
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is not None:
                self.webpagedb.html2db(url, html)

                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                print(links)
                self.add_seeds(links)

                file_pattern = [re.compile(self.file_rule)]
                files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
                self.files.append(files)
                # TODO: self.download_files(files)
                print(files)

    def mysleep(self, n):
        """Sleep *n* seconds, printing a progress line after each second."""
        for i in range(n):
            time.sleep(1)
            print("sleep %s of %s" % (i, n))
class Phantom(QObject):
    """The ``phantom`` object exposed to the executed JavaScript.

    Tracks every ``WebPage`` it created in ``m_pages`` and carries the
    script and output encodings.
    """

    def __init__(self, parent, args):
        """Build the main page, apply settings from *args* and run bootstrap.js.

        Reads ``verbose``, ``script``, ``script_args``, ``script_encoding``,
        ``output_encoding``, ``proxy``, ``load_images``, ``load_plugins`` and
        ``local_to_remote_url_access`` from *args*.
        """
        super(Phantom, self).__init__(parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self, args)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.app_args = args
        self.m_scriptFile = args.script
        self.m_args = args.script_args
        self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
        self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys)

        # The main page is tracked like any page from createWebPage().
        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        if args.proxy is None:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as (host, port).
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['javascriptEnabled'] = True
        self.m_defaultPageSettings['XSSAuditingEnabled'] = False
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings['localToRemoteUrlAccessEnabled'] = args.local_to_remote_url_access
        self.m_page.applySettings(self.m_defaultPageSettings)

        # Goes through the libraryPath property setter, i.e. onto m_page.
        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        with QPyFile(':/bootstrap.js') as f:
            bootstrap = f.readAll()
        self.m_page.mainFrame().evaluateJavaScript(bootstrap)

        do_action('PhantomInitPost')

    def execute(self):
        """Inject the script file into the main frame.

        Returns:
            False when ``exit`` was called while the script ran, else True.
        """
        injectJsInFrame(self.m_scriptFile, self.m_scriptEncoding.encoding,
                        os.path.dirname(os.path.abspath(__file__)),
                        self.m_page.mainFrame(), True)
        return not self.m_terminated

    def printConsoleMessage(self, message, lineNumber, source):
        """Print a JavaScript console message, prefixed with source:line if known."""
        if source:
            message = '%s:%d %s' % (source, lineNumber, message)
        print(message)

    def returnValue(self):
        """Return the exit code recorded by ``exit`` (0 until then)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Arguments passed to the script."""
        return self.m_args

    @pyqtSlot(result=FileSystem)
    def createFilesystem(self):
        """Create a ``FileSystem`` object parented to this object."""
        return FileSystem(self)

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create and track another ``WebPage`` with the default settings."""
        page = WebPage(self, self.app_args)
        self.m_pages.append(page)
        page.applySettings(self.m_defaultPageSettings)
        page.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """Settings applied to every page created through this object."""
        return self.m_defaultPageSettings

    # Two overloads: exit() and exit(code).
    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record *code*, destroy every tracked page and leave the Qt event loop."""
        self.m_terminated = True
        self.m_returnValue = code

        # stop javascript execution in start script;
        # delete all the pages C++ objects, then clear
        # the page list, and empty the Phantom page
        for page in self.m_pages:
            sip.delete(page)
        del self.m_pages[:]
        self.m_page = None

        QApplication.instance().exit(code)

    @pyqtSlot(str, result=bool)
    def injectJs(self, filePath):
        """Inject *filePath* (resolved against ``libraryPath``) into the main frame."""
        return injectJsInFrame(filePath, self.m_scriptEncoding.encoding,
                               self.libraryPath, self.m_page.mainFrame())

    @pyqtSlot(str, result=str)
    def loadModuleSource(self, name):
        """Return the source of the bundled module ``:/modules/<name>.js``."""
        moduleSourceFilePath = ':/modules/%s.js' % name

        with QPyFile(moduleSourceFilePath) as f:
            moduleSource = f.readAll()

        return moduleSource

    @pyqtProperty(str)
    def libraryPath(self):
        """Library path, stored on the main page."""
        return self.m_page.libraryPath

    @libraryPath.setter
    def libraryPath(self, dirPath):
        self.m_page.libraryPath = dirPath

    @pyqtProperty(str)
    def outputEncoding(self):
        """Name of the encoding used for output."""
        return self.m_outputEncoding.name

    @outputEncoding.setter
    def outputEncoding(self, encoding):
        self.m_outputEncoding = Encode(encoding, self.m_outputEncoding.encoding)

        sys.stdout.encoding = self.m_outputEncoding.encoding
        sys.stdout.encode_to = self.m_outputEncoding.encoding
        sys.stderr.encoding = self.m_outputEncoding.encoding
        # Was a second assignment to sys.stdout.encode_to (copy-paste slip),
        # which left stderr encoding with the old codec.
        # NOTE(review): assumes stderr is wrapped like stdout and honours
        # ``encode_to`` — confirm.
        sys.stderr.encode_to = self.m_outputEncoding.encoding

    @pyqtProperty(str)
    def scriptName(self):
        """Base name of the script file."""
        return os.path.basename(self.m_scriptFile)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version as a map with 'major', 'minor' and 'patch' keys."""
        version = {
            'major': __version_info__[0],
            'minor': __version_info__[1],
            'patch': __version_info__[2]
        }
        return version

    # NOTE(review): placed in the class body here; the collapsed source does
    # not show the indentation — confirm.
    do_action('Phantom')
def createWebPage(self):
    """Create a new ``WebPage``, register it and hand it back.

    The page is built from this object and ``self.app_args``, appended to
    ``self.m_pages``, given the default page settings and its library path
    is set to the directory of the script file.
    """
    new_page = WebPage(self, self.app_args)
    self.m_pages.append(new_page)
    new_page.applySettings(self.m_defaultPageSettings)

    script_path = os.path.abspath(self.m_scriptFile)
    new_page.libraryPath = os.path.dirname(script_path)
    return new_page
class Crawler():
    """Rule-driven crawler backed by queue / page / duplicate-check stores."""

    def __init__(self):
        self.downloader = DownloadManager()  # downloads the pages
        self.webpage = None  # parses the page currently processed
        self.initDatabase()
        # {compiled url pattern: [compiled link patterns]}
        self.rules = {}

    def initDatabase(self):
        """Initialise the databases."""
        self.queue = QueueDB()  # the to-do table
        self.webpagedb = WebpageDB()
        self.duplcheck = DuplCheckDB()

    def addSeeds(self, links):
        """Add seed URLs.

        Args:
            links: List of URLs.
        """
        # Drop the URLs that were already seen.
        new_links = self.duplcheck.filterDuplUrls(links)
        # Remember the remaining ones as visited ...
        self.duplcheck.addUrls(new_links)
        # ... and append them to the to-do table.
        self.queue.pushUrls(new_links)

    def addRules(self, rules):
        """Replace the rule set.

        Args:
            rules: Mapping of a URL regex string to the list of link regex
                strings that may be followed from pages matching it.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of all rules matching *url*."""
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Start crawling; runs until the URL queue is exhausted."""
        while True:
            url = self.queue.popUrl()
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is not None:
                # Keep a copy of the downloaded page.
                self.webpagedb.storeHtmlToDb(url, html)

                # Parse the page and collect all of its hyperlinks.
                self.webpage = WebPage(url, html)
                self.webpage.parseLinks()
                ruptn = self.get_patterns_from_rules(url)
                print(ruptn)
                # filter_links may yield None, hence the guard below.
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                if links:
                    self.addSeeds(links)
            # Rest a little before fetching the next page.
            self.mysleep(3)

    def mysleep(self, n):
        """Sleep *n* seconds, printing a progress line after each second."""
        for second in range(1, n + 1):
            time.sleep(1)
            print("sleep %s of %s" % (second, n))
def _read_text(path):
    """Return the full content of the text file at *path*, closing it afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


def _parse_global_counts(text, word_count):
    """Collect the *word_count* trailing numeric columns of every data row in *text*."""
    columns = ',[0-9]+' * word_count
    counts = []
    for row in re.findall('[0-9]+-.*?-.*?' + columns, text):
        counts += re.findall(r'\d+', re.findall(columns, row)[0])
    return counts


def _parse_word_counts(text):
    """Collect the first numeric column of every data row in a single-word CSV text."""
    counts = []
    for row in re.findall(r'[0-9]+-.*?-.*?,[0-9]+', text):
        counts += re.findall(r'\d+', re.findall(',[0-9]+', row)[0])
    return counts


def _write_series(path, allwords, globalData):
    """Write one line per data point: each word's count followed by its global count."""
    word_count = len(allwords)
    with open(path, 'w') as out:
        for i in range(len(allwords[0])):
            for j in range(word_count):
                out.write(allwords[j][i] + " " + globalData[i * word_count + j] + " ")
            out.write("\n")


def OnMouseDown(event, arg, login, password, instr, flag, link, canvas):
    """Collect the entered words, load their count series, plot them and run ``make``.

    event    -- unused.
    arg      -- sequence of objects with ``.get()``; non-empty values become
                the search words (presumably Tkinter variables/entries - confirm).
    login, password -- forwarded to ``WebPage.getContent`` when downloading.
    instr    -- sequence of objects with ``.get()``; the first ``len(words)``
                values are joined into the ``ARGS=`` make variable.
    flag     -- object with ``.get()``; ``1`` means "use the CSV files already
                on disk", anything else downloads fresh data first.
    link     -- object with ``.set()`` that receives ``page.link``.
    canvas   -- object with ``create_line`` used for the plot.

    Does nothing when no word was entered (the previous code failed with an
    IndexError in that case, after truncating the output file).
    """
    words = [text for text in (field.get() for field in arg) if text != ""]
    if not words:
        return

    local_only = flag.get() == 1

    page = WebPage(words)
    link.set(page.link)
    if not local_only:
        # get data from website
        page.getContent(login, password)

    # get global data
    globalData = _parse_global_counts(_read_text('data.csv'), len(words))

    if not local_only:
        for word in words:
            page = WebPage([word])
            link.set(page.link)
            page.getContent(login, password)
        # rename: the argv list is built directly so a word containing
        # whitespace is still passed as a single argument.
        for index, word in enumerate(words):
            command = ["rename", "data(%d).csv" % (index + 1), word + ".csv"]
            process = subprocess.Popen(command, stdout=subprocess.PIPE)
            process.communicate()

    # read individual
    allwords = [_parse_word_counts(_read_text(word + ".csv")) for word in words]

    # output: the file is closed (and therefore flushed) before make runs below
    _write_series('Java/LetTheDataSpeak/data.txt', allwords, globalData)

    canvas.create_line(0, 114, 520, 114, fill="black", width=3)
    color = ["coral", "yellow", "green", "blue", "red"]
    # visualisation: each segment joins point j to point j + 1, so the last
    # usable j is one before the end of the series (and at most 519).
    for i, series in enumerate(allwords):
        last = min(len(allwords[0]), len(series)) - 1
        for j in range(min(last, 520)):
            canvas.create_line(j, 110 - int(series[j]),
                               j + 1, 110 - int(series[j + 1]),
                               fill=color[i % len(color)], width=1)

    import time
    time.sleep(5)
    # NOTE(review): instr is indexed by position in the filtered word list, not
    # by position in arg - confirm this is intended when some fields are blank.
    ARGS = "ARGS=" + "".join(instr[i].get() + ' ' for i in range(len(words)))
    print(ARGS)
    sys.stdout.flush()
    subprocess.call(["make", ARGS])
class Crawler(object):
    """Rule-driven crawler built on a URL queue, a page store and a duplicate filter."""

    def __init__(self):
        super(Crawler, self).__init__()
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        """Open the queue, page and duplicate-check databases."""
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        """Queue every URL in *links* that the duplicate filter has not seen yet."""
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        """Replace the rule table.

        rules: mapping of a page-URL regex string to an iterable of
        link regex strings. Both sides are compiled with ``re.compile``.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of every rule whose key matches *url*."""
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def getlinks(self, url, html):
        """Parse *html* and return the ``a`` links accepted by the rules for *url*."""
        self.webpage = WebPage(url, html)
        self.webpage.parse_links()
        ruptn = self.get_patterns_from_rules(url)
        links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
        return links

    def start(self):
        """Crawl until the queue yields ``None``."""
        while True:
            url = self.queue.pop_url()
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is not None:
                self.webpagedb.html2db(url, html)
                links = self.getlinks(url, html)
                # Only seed when filtering produced something; an empty or
                # missing result must not reach the duplicate filter.
                if links:
                    self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        """Sleep *n* seconds, reporting "sleep k of n" after the k-th second."""
        for i in range(n):
            time.sleep(1)
            print("sleep %s of %s" % (i + 1, n))
def get(self, url):
    """Fetch ``self.baseUrl + url`` with ``requests.get`` and wrap the response text in a WebPage."""
    target = self.baseUrl + url
    response = requests.get(target)
    return WebPage(response.text)
print("Usage: {} <source_url> [max num of internal_links]".format( sys.argv[0])) sys.exit(0) source_url = sys.argv[1] max_sites = float('inf') if len(sys.argv) == 3: max_sites = int(sys.argv[2]) to_process = [source_url] visited = set() internal_links = [] while len(to_process): url = to_process.pop() w = WebPageLinkExtractor(WebPage(url)) if len(visited) >= max_sites: break for web in w.internal_links: index = web.url.find('#') if index != -1: web.url = web.url[:index] if web.url not in visited: visited.add(web.url) to_process.append(web.url) time.sleep(0.3) for link in visited:
def createWebPage(self):
    """Create and return a new WebPage owned by this object.

    The page receives the default page settings and this object's network
    access manager; its ``scriptLookupDir`` is set to the directory that
    contains the script file.
    """
    new_page = WebPage(self)
    new_page.applySettings(self.m_defaultPageSettings)
    new_page.setNetworkAccessManager(self.m_netAccessMan)
    script_path = os.path.abspath(self.m_scriptFile)
    new_page.scriptLookupDir = os.path.dirname(script_path)
    return new_page
class Phantom(QObject):
    # Controller object exposed to the page's JavaScript as ``phantom``.
    # It owns one WebPage, evaluates the user script in it and offers
    # rendering, script loading and state helpers as Qt properties/slots.
    #
    # NOTE: local variable names in __init__ are kept as they are because
    # they are handed to plugins through Bunch(locals()).

    def __init__(self, args, parent=None):
        """Set up the page, settings, proxy and signal wiring from parsed *args*."""
        QObject.__init__(self, parent)

        # variable declarations
        self.m_loadStatus = self.m_state = ''
        # Three independent dicts: a chained assignment would make the user
        # context, the paper size and the script cache share one object.
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # returnValue() may be queried before exit() was ever called
        self.m_returnValue = 0
        # setup the values from args
        self.m_script = args.script.read()
        self.m_scriptFile = args.script.name
        self.m_scriptDir = os.path.dirname(args.script.name) + '/'
        self.m_args = args.script_args
        self.m_upload_file = args.upload_file
        autoLoadImages = False if args.load_images == 'no' else True
        pluginsEnabled = True if args.load_plugins == 'yes' else False

        args.script.close()

        do_action('PhantomInitPre', Bunch(locals()))

        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
        self.m_page.settings().setAttribute(QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled, True)
        self.m_page.settings().setLocalStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

        m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(self.inject)
        self.m_page.loadFinished.connect(self.finish)

        do_action('PhantomInitPost', Bunch(locals()))

    def execute(self):
        """Evaluate the user script in the page's main frame.

        A leading ``#!`` line is commented out first, and ``.coffee`` files
        are converted with CSConverter before evaluation.
        """
        if self.m_script.startswith('#!'):
            self.m_script = '//' + self.m_script

        if self.m_scriptFile.lower().endswith('.coffee'):
            coffee = CSConverter(self)
            self.m_script = coffee.convert(self.m_script)

        self.m_page.mainFrame().evaluateJavaScript(self.m_script)

    def finish(self, success):
        """Record the load result and evaluate the script again (loadFinished handler)."""
        self.m_loadStatus = 'success' if success else 'fail'
        self.m_page.mainFrame().evaluateJavaScript(self.m_script)

    def inject(self):
        """Expose this object to JavaScript under the name ``phantom``."""
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

    def renderPdf(self, fileName):
        """Print the main frame to *fileName* as PDF.

        Returns True on success, False when the paper size holds neither a
        width/height pair nor a ``format`` entry.
        """
        p = QPrinter()
        p.setOutputFormat(QPrinter.PdfFormat)
        p.setOutputFileName(fileName)
        p.setResolution(pdf_dpi)
        # Work on a copy so the page-size defaults filled in below do not
        # leak into the stored paperSize property.
        paperSize = dict(self.m_paperSize)

        if not len(paperSize):
            pageSize = QSize(self.m_page.mainFrame().contentsSize())
            paperSize['width'] = str(pageSize.width()) + 'px'
            paperSize['height'] = str(pageSize.height()) + 'px'
            paperSize['border'] = '0px'

        if paperSize.get('width') and paperSize.get('height'):
            sizePt = QSizeF(ceil(self.stringToPointSize(paperSize['width'])),
                            ceil(self.stringToPointSize(paperSize['height'])))
            p.setPaperSize(sizePt, QPrinter.Point)
        elif 'format' in paperSize:
            if paperSize.get('orientation') and paperSize['orientation'].lower() == 'landscape':
                orientation = QPrinter.Landscape
            else:
                orientation = QPrinter.Portrait
            orientation = QPrinter.Orientation(orientation)
            p.setOrientation(orientation)

            formats = {
                'A3': QPrinter.A3,
                'A4': QPrinter.A4,
                'A5': QPrinter.A5,
                'Legal': QPrinter.Legal,
                'Letter': QPrinter.Letter,
                'Tabloid': QPrinter.Tabloid
            }

            p.setPaperSize(QPrinter.A4)  # fallback
            wanted = paperSize['format'].lower()
            for name, size in formats.items():
                if name.lower() == wanted:
                    p.setPaperSize(size)
                    break
        else:
            return False

        border = floor(self.stringToPointSize(paperSize['border'])) if paperSize.get('border') else 0
        p.setPageMargins(border, border, border, border, QPrinter.Point)

        self.m_page.mainFrame().print_(p)
        return True

    def returnValue(self):
        """Return the exit code passed to exit(), or 0 if exit() was not called."""
        return self.m_returnValue

    def stringToPointSize(self, string):
        """Convert a size string with an optional mm/cm/in/px suffix to points."""
        units = (
            ('mm', 72 / 25.4),
            ('cm', 72 / 2.54),
            ('in', 72.0),
            ('px', 72.0 / pdf_dpi / 2.54),
            ('', 72.0 / pdf_dpi / 2.54)
        )

        for unit, factor in units:
            if string.endswith(unit):
                value = string.rstrip(unit)
                return float(value) * factor
        return 0

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        return self.m_args

    @pyqtProperty('QVariantMap')
    def clipRect(self):
        result = {
            'width': self.m_clipRect.width(),
            'height': self.m_clipRect.height(),
            'top': self.m_clipRect.top(),
            'left': self.m_clipRect.left()
        }
        return result

    @clipRect.setter
    def clipRect(self, size):
        # Missing keys keep the current value; negative width/height clamp
        # to 0. Values are gathered in a local dict instead of module globals.
        names = ('width', 'height', 'top', 'left')
        rect = {}
        for item in names:
            try:
                rect[item] = int(size[item])
                if rect[item] < 0:
                    if item not in ('top', 'left'):
                        rect[item] = 0
            except KeyError:
                rect[item] = getattr(self.m_clipRect, item)()

        self.m_clipRect = QRect(rect['left'], rect['top'], rect['width'], rect['height'])

    @pyqtProperty(str)
    def content(self):
        return self.m_page.mainFrame().toHtml()

    @content.setter
    def content(self, content):
        self.m_page.mainFrame().setHtml(content)

    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Store *code*, stop reacting to page loads and schedule application quit."""
        self.m_returnValue = code
        self.m_page.loadFinished.disconnect(self.finish)
        QTimer.singleShot(0, qApp, SLOT('quit()'))

    @pyqtProperty(str)
    def loadStatus(self):
        return self.m_loadStatus

    @pyqtSlot(str, result=bool)
    def loadScript(self, script):
        """Evaluate a script file from the script directory, caching its source.

        Returns False when the file cannot be read, True otherwise.
        """
        if script in self.m_loadScript_cache:
            self.m_page.mainFrame().evaluateJavaScript(self.m_loadScript_cache[script])
            return True

        scriptFile = script
        try:
            # ``with`` closes the file even if reading/decoding fails
            with codecs.open(self.m_scriptDir + script, encoding='utf-8') as handle:
                script = handle.read()
        except IOError:
            return False

        if script.startswith('#!'):
            script = '//' + script

        if scriptFile.lower().endswith('.coffee'):
            coffee = CSConverter(self)
            script = coffee.convert(script)

        self.m_loadScript_cache[scriptFile] = script
        self.m_page.mainFrame().evaluateJavaScript(script)
        return True

    @pyqtSlot(str, name='open')
    def open_(self, address):
        qDebug('Opening address %s' % address)
        self.m_page.triggerAction(QWebPage.Stop)
        self.m_loadStatus = 'loading'
        self.m_page.mainFrame().setUrl(QUrl(address))

    @pyqtProperty('QVariantMap')
    def paperSize(self):
        return self.m_paperSize

    @paperSize.setter
    def paperSize(self, size):
        self.m_paperSize = size

    @pyqtSlot(str, result=bool)
    def render(self, fileName):
        """Render the page (or the clip rectangle) to *fileName*; PDFs go through renderPdf."""
        fileInfo = QFileInfo(fileName)
        path = QDir()
        path.mkpath(fileInfo.absolutePath())

        if fileName.lower().endswith('.pdf'):
            return self.renderPdf(fileName)

        viewportSize = QSize(self.m_page.viewportSize())
        pageSize = QSize(self.m_page.mainFrame().contentsSize())

        bufferSize = QSize()
        if not self.m_clipRect.isEmpty():
            bufferSize = self.m_clipRect.size()
        else:
            bufferSize = self.m_page.mainFrame().contentsSize()

        # NOTE(review): compares a QSize with a string; looks like a leftover
        # emptiness check - confirm what was intended.
        if pageSize == '':
            return False

        image = QImage(bufferSize, QImage.Format_ARGB32)
        image.fill(qRgba(255, 255, 255, 0))
        p = QPainter(image)
        p.setRenderHint(QPainter.Antialiasing, True)
        p.setRenderHint(QPainter.TextAntialiasing, True)
        p.setRenderHint(QPainter.SmoothPixmapTransform, True)
        # the viewport is enlarged for rendering and restored afterwards
        self.m_page.setViewportSize(pageSize)

        if not self.m_clipRect.isEmpty():
            p.translate(-self.m_clipRect.left(), -self.m_clipRect.top())
            self.m_page.mainFrame().render(p, QRegion(self.m_clipRect))
        else:
            self.m_page.mainFrame().render(p)

        p.end()
        self.m_page.setViewportSize(viewportSize)
        return image.save(fileName)

    @pyqtSlot('QWebElement', str)
    def setFormInputFile(self, el, fileTag):
        """Remember *fileTag* on the page and dispatch a synthetic click on *el*."""
        self.m_page.m_nextFileTag = fileTag
        el.evaluateJavaScript(
            '''(function(target){ var evt = document.createEvent('MouseEvents'); '''
            '''evt.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null); '''
            '''target.dispatchEvent(evt);})(this);''')

    @pyqtSlot(int)
    def sleep(self, ms):
        """Keep processing Qt events until more than *ms* milliseconds have passed."""
        startTime = QTime.currentTime()
        while True:
            QApplication.processEvents(QEventLoop.AllEvents, 25)
            if startTime.msecsTo(QTime.currentTime()) > ms:
                break
            usleep(0.005)

    @pyqtProperty(str)
    def state(self):
        return self.m_state

    @state.setter
    def state(self, value):
        self.m_state = value

    @pyqtProperty(str)
    def userAgent(self):
        return self.m_page.m_userAgent

    @userAgent.setter
    def userAgent(self, ua):
        self.m_page.m_userAgent = ua

    @pyqtSlot(str, result='QVariant')
    @pyqtSlot(int, result='QVariant')
    @pyqtSlot(str, 'QVariant')
    @pyqtSlot(int, 'QVariant')
    def ctx(self, name, value=None):
        """Get (no value) or set (value given) an entry of the user context dict."""
        # NOTE(review): a falsy value such as 0 or '' is treated as a read.
        if not value:
            return self.m_var.get(name)
        self.m_var[name] = value

    @pyqtProperty('QVariantMap')
    def version(self):
        version = {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }
        return version

    @pyqtProperty('QVariantMap')
    def viewportSize(self):
        size = self.m_page.viewportSize()
        result = {'width': size.width(), 'height': size.height()}
        return result

    @viewportSize.setter
    def viewportSize(self, size):
        # Missing keys keep the current value; negatives clamp to 0. Values
        # are gathered in a local dict instead of module globals.
        names = ('width', 'height')
        dims = {}
        for item in names:
            try:
                dims[item] = int(size[item])
                if dims[item] < 0:
                    dims[item] = 0
            except KeyError:
                dims[item] = getattr(self.m_page.viewportSize(), item)()

        self.m_page.setViewportSize(QSize(dims['width'], dims['height']))

    do_action('Phantom', Bunch(locals()))
class WebBrowser(QObject):
    """Headless QWebView wrapper with its own cookie jar and redirect handling."""

    def __init__(self):
        logging.debug("-->")
        super(WebBrowser, self).__init__()

        # Reuse the running QApplication when there is one.
        self.app = QApplication.instance()
        if self.app is None:
            self.app = QApplication(sys.argv)
            self.app.setQuitOnLastWindowClosed(False)

        self.event_loop = QEventLoop()
        self.cookie_jar = CookieJar()
        # The proxy object is created but not installed on the network manager.
        self.proxy = QNetworkProxy(QNetworkProxy.HttpProxy, "127.0.1.1", 8888)

        self.network_manager = NetworkAccessManager()
        self.network_manager.setCookieJar(self.cookie_jar)

        self.web_page = WebPage()
        self.web_page.setNetworkAccessManager(self.network_manager)

        self.web_view = QWebView()
        self.web_view.setPage(self.web_page)

        view_settings = (
            (QWebSettings.AutoLoadImages, False),
            (QWebSettings.PluginsEnabled, True),
            (QWebSettings.JavascriptEnabled, True),
            (QWebSettings.LocalContentCanAccessRemoteUrls, True),
        )
        for attribute, enabled in view_settings:
            self.web_view.settings().setAttribute(attribute, enabled)

        self.connect(self.web_view.page().networkAccessManager(),
                     SIGNAL("finished(QNetworkReply*)"),
                     self.network_reply_finished)

        self.page_loaded_validator = None
        self.page_loaded_handler = None
        self.page_loaded_handler_kwargs = None
        self.timeout_message = None
        self.timer = None
        self.event_loop_exception = None
        logging.debug("<--")

    def network_reply_finished(self, reply):
        """Mark the request as completed and follow a redirect if the reply names one."""
        logging.debug("Reply received for: " + reply.request().url().toString())
        self.network_manager.request_queue[reply.request().url()] = "Completed"
        target = reply.attribute(QNetworkRequest.RedirectionTargetAttribute)
        redirect_url = self.get_redirect_url(target, reply.request().url())
        if redirect_url is None:
            return
        self.redirect(redirect_url, reply.request())

    def redirect(self, url, request):
        """Load *url* in the frame that issued *request*, if such a frame is found."""
        frame = self.find_frame_to_redirect(self.web_view.page().mainFrame(), request)
        if frame is None:
            return
        logging.debug("Redirecting to: " + url.toString())
        frame.load(url)

    def find_frame_to_redirect(self, frame, request):
        """Depth-first search for the frame whose requested URL equals the request URL."""
        if frame.requestedUrl() == request.url():
            return frame
        for child in frame.childFrames():
            match = self.find_frame_to_redirect(child, request)
            if match is not None:
                return match
        return None

    def get_redirect_url(self, possible_redirect_url, orig_requested_url):
        """Return the URL to redirect to, or None when no redirect applies.

        A relative target inherits scheme and host from the original URL;
        a target equal to the original URL is ignored.
        """
        if possible_redirect_url is None:
            return None
        if possible_redirect_url.isRelative():
            if orig_requested_url.isRelative():
                return None
            possible_redirect_url.setScheme(orig_requested_url.scheme())
            possible_redirect_url.setHost(orig_requested_url.host())
        if orig_requested_url != possible_redirect_url:
            return possible_redirect_url
        return None

    def get_cookies(self):
        """Return every cookie in the jar in raw form."""
        return [cookie.toRawForm() for cookie in self.cookie_jar.allCookies()]

    def set_cookies(self, raw_cookies):
        """Replace the jar's content with the cookies parsed from *raw_cookies*."""
        parsed = []
        for raw_cookie in raw_cookies:
            parsed.extend(QNetworkCookie.parseCookies(raw_cookie))
        self.cookie_jar.setAllCookies(parsed)

    def cleanup(self):
        """Disconnect the reply handler, detach the Qt objects and drop the references."""
        logging.debug("-->")
        self.disconnect(self.web_view.page().networkAccessManager(),
                        SIGNAL("finished(QNetworkReply*)"),
                        self.network_reply_finished)
        for owned in (self.web_view, self.web_page, self.network_manager,
                      self.event_loop, self):
            owned.setParent(None)
        del self.web_view
        del self.web_page
        del self.network_manager
        del self.event_loop
        del self.app
        logging.debug("<--")
class Phantom(QObject):
    # Controller object exposed to the page's JavaScript as ``phantom``.
    # It owns the main WebPage, evaluates the bootstrap and user scripts in
    # it, and creates further pages on request.
    #
    # NOTE: __init__ and the class body pass locals() to plugin hooks via
    # do_action, so local names and statement order are observable.

    def __init__(self, args, parent=None):
        """Build the main page from parsed *args* and evaluate the bootstrap script.

        Exits the process with status 1 when ``:/bootstrap.js`` cannot be
        opened or is empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_script = args.script
        self.m_scriptFile = args.script_name
        self.m_args = args.script_args

        do_action('PhantomInitPre', Bunch(locals()))

        # Without an explicit proxy the system proxy configuration is used.
        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        # These defaults are also applied to every page made by createWebPage().
        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_page.applySettings(self.m_defaultPageSettings)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            qCritical('Can not bootstrap!')
            sys.exit(1)
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        if not bootstrapper:
            qCritical('Can not bootstrap!')
            sys.exit(1)
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        do_action('PhantomInitPost', Bunch(locals()))

    def execute(self):
        """Evaluate the user script in the main frame.

        ``.coffee`` scripts are converted with CSConverter first, and a
        leading ``#!`` line is commented out. Returns False when exit() was
        called during evaluation, True otherwise.
        """
        if self.m_scriptFile.lower().endswith('.coffee'):
            coffee = CSConverter(self)
            self.m_script = coffee.convert(self.m_script)

        if self.m_script.startswith('#!'):
            self.m_script = '//' + self.m_script

        self.m_page.mainFrame().evaluateJavaScript(self.m_script)
        return not self.m_terminated

    def printConsoleMessage(self, msg):
        """Print a JavaScript console message (javaScriptConsoleMessageSent handler)."""
        print msg

    def returnValue(self):
        """Return the exit code: 0 unless exit() stored another value."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Script arguments taken from ``args.script_args``."""
        return self.m_args

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create a WebPage with the default settings and the shared network access manager."""
        page = WebPage(self)
        page.applySettings(self.m_defaultPageSettings)
        page.setNetworkAccessManager(self.m_netAccessMan)
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """The settings dict applied to the main page and to created pages."""
        return self.m_defaultPageSettings

    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Mark the run as terminated, store *code* and exit the QApplication with it."""
        self.m_terminated = True
        self.m_returnValue = code
        QApplication.instance().exit(code)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Map with the ``major``, ``minor`` and ``patch`` version numbers."""
        version = {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }
        return version

    do_action('Phantom', Bunch(locals()))
def __init__(self, args, parent=None):
    """Set up the page, settings, proxy and signal wiring from parsed *args*.

    ``args.script`` is a sequence whose first item is the opened script
    file (read, then closed here) and whose remaining items become the
    script arguments.
    """
    QObject.__init__(self, parent)

    # variable declarations
    # Separate objects on purpose: a chained assignment would make the
    # status strings, the user context, the paper size and the script cache
    # share state.
    self.m_loadStatus = QString()
    self.m_state = QString()
    self.m_var = {}
    self.m_paperSize = {}
    self.m_loadScript_cache = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_clipRect = QRect()
    # setup the values from args
    self.m_script = QString.fromUtf8(args.script[0].read())
    self.m_scriptFile = args.script[0].name
    self.m_args = args.script[1:]
    self.m_upload_file = args.upload_file
    autoLoadImages = False if args.load_images == 'no' else True
    pluginsEnabled = True if args.load_plugins == 'yes' else False

    args.script[0].close()

    palette = self.m_page.palette()
    palette.setBrush(QPalette.Base, Qt.transparent)
    self.m_page.setPalette(palette)

    # Without an explicit proxy the system proxy configuration is used.
    if not args.proxy:
        QNetworkProxyFactory.setUseSystemConfiguration(True)
    else:
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)

    self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
    self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
    self.m_page.settings().setAttribute(QWebSettings.FrameFlatteningEnabled, True)
    self.m_page.settings().setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
    self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled, True)
    self.m_page.settings().setLocalStoragePath(
        QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    self.m_page.settings().setOfflineStoragePath(
        QDesktopServices.storageLocation(QDesktopServices.DataLocation))

    # Ensure we have a document.body.
    self.m_page.mainFrame().setHtml('<html><body></body></html>')

    self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
    self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

    # if our script was called in a different directory, change to it
    # to make any dealings with files be relative to the scripts directory
    if os.path.dirname(self.m_scriptFile):
        os.chdir(os.path.dirname(self.m_scriptFile))

    # The custom network access manager is only installed in verbose mode.
    if self.m_verbose:
        m_netAccessMan = NetworkAccessManager(self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

    # inject our properties and slots into javascript
    self.connect(self.m_page.mainFrame(), SIGNAL('javaScriptWindowObjectCleared()'), self.inject)
    self.connect(self.m_page, SIGNAL('loadFinished(bool)'), self.finish)
def createWebPage(self):
    """Create and return a new WebPage owned by this object.

    The page receives the default page settings and this object's
    network access manager.
    """
    new_page = WebPage(self)
    new_page.applySettings(self.m_defaultPageSettings)
    new_page.setNetworkAccessManager(self.m_netAccessMan)
    return new_page