Beispiel #1
0
 def __init__(self, parent, url=''):
     """Initialise the view, install a ``WebPage`` and configure web settings.

     Args:
         parent: Parent object handed to the base-class constructor; its
             ``setWindowTitle`` is connected to this view's ``titleChanged``.
         url: Value passed to ``self.load`` during construction.
     """
     super(WebView, self).__init__(parent)

     # Drag-related state (presumably consumed by handlers defined elsewhere).
     self.draging = False
     self.drag = QDrag(self)
     self.dragStartPos = None

     # Install the project's page class and remember its main frame.
     self.webPage = WebPage()
     self.setPage(self.webPage)
     self.mainFrame = self.page().mainFrame()

     self.setAttribute(Qt.WA_DeleteOnClose, True)
     self.titleChanged.connect(parent.setWindowTitle)
     self.load(url)

     prefs = self.settings()
     prefs.setDefaultTextEncoding("utf-8")
     # The offline storage / application cache quota setters are not called
     # here, so those quotas are not changed by this constructor.
     prefs.enablePersistentStorage(assets.fs.dataPath())

     # Every attribute below is switched on, in this order.
     enabled_attributes = (
         QWebSettings.PluginsEnabled,
         QWebSettings.DnsPrefetchEnabled,
         QWebSettings.XSSAuditingEnabled,
         QWebSettings.CSSGridLayoutEnabled,
         QWebSettings.ScrollAnimatorEnabled,
         QWebSettings.DeveloperExtrasEnabled,
         QWebSettings.JavascriptCanOpenWindows,
         QWebSettings.JavascriptCanCloseWindows,
         QWebSettings.JavascriptCanAccessClipboard,
         QWebSettings.LocalContentCanAccessFileUrls,
         QWebSettings.LocalContentCanAccessRemoteUrls,
     )
     for attribute in enabled_attributes:
         prefs.setAttribute(attribute, True)

     frame = self.mainFrame
     frame.javaScriptWindowObjectCleared.connect(self.setJavaScriptObject)
     frame.iconChanged.connect(self.changeIcon)
Beispiel #2
0
    def crawl(self):
        """Process queued URLs, recording their assets and queueing new links.

        URLs are taken from the left of ``self.url_queue`` while the queue is
        non-empty and ``len(self.discovered)`` does not exceed
        ``self.MAX_LINKS_TO_VISIT``. For each URL a ``WebPage`` is built, its
        assets are appended to ``self.assets`` and its anchors are examined.
        An anchor accepted by ``self.same_domain_rule`` that has no entry in
        ``self.discovered`` is marked discovered and, unless already tracked
        in ``self.unvisited``, appended to the queue.

        When the ``DEBUG`` environment variable is set, the queue size and the
        URL being fetched are printed.

        NOTE(review): ``self.discovered`` and ``self.unvisited`` are indexed
        with keys that may be missing and compared against ``None``, so they
        presumably behave like mappings that default to ``None`` -- confirm
        against where they are created.
        """
        while (len(self.url_queue) > 0
               and len(self.discovered) <= self.MAX_LINKS_TO_VISIT):
            url = self.url_queue.popleft()

            if 'DEBUG' in os.environ:
                # Single-argument print calls emit the same text on Python 2
                # and Python 3; the bare print statement only parses on 2.
                print("Queue Size: %s" % len(self.url_queue))
                print("Fetching:  %s" % url)

            webpage = WebPage(url)
            self.unvisited[url] = False

            all_links = webpage.get_anchors(False)  # False: drop fragments
            all_assets = webpage.get_assets()
            self.assets.append({'url': url, 'assets': all_assets})

            for link in all_links:
                # Only links that belong to the same domain are considered.
                if not self.same_domain_rule.matches(link):
                    continue

                link_url = link.geturl()
                # Skip links that were already discovered.
                if self.discovered[link_url] is not None:
                    continue
                self.discovered[link_url] = True

                # Queue the link unless it is already tracked.
                if self.unvisited[link_url] is None:
                    self.url_queue.append(link_url)
                    self.unvisited[link_url] = True
Beispiel #3
0
    def __init__(self, profile: QWebEngineProfile):
        """Create the address bar and web view, lay them out, wire signals.

        Args:
            profile: Web engine profile passed to the ``WebPage`` that is
                installed on the embedded view.
        """
        super().__init__()

        self.m_addressBar = address_bar = UrlLineEdit(self)
        self.m_view = view = WebView(self)
        self.setAttribute(Qt.WA_DeleteOnClose)
        self.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Minimum)

        # Vertical stack: address bar first, then the view.
        column = QVBoxLayout()
        self.setLayout(column)
        for widget in (address_bar, view):
            column.addWidget(widget)

        view.setPage(WebPage(profile, view))
        view.setFocus()
        address_bar.setReadOnly(True)
        address_bar.setFavIcon(QIcon(":defaulticon.png"))

        # View-level signals.
        view.titleChanged.connect(self.setWindowTitle)
        view.urlChanged.connect(self.setUrl)

        # Page-level signals.
        page = view.page()
        page.iconChanged.connect(self.handleIconChanged)
        page.geometryChangeRequested.connect(
            self.handleGeometryChangeRequested)
        page.windowCloseRequested.connect(self.close)
Beispiel #4
0
 def createWebPage(self):
     """Create a ``WebPage`` parented to this object and return it.

     The new page is appended to ``self.m_pages``, receives the default page
     settings and the shared network access manager, and gets its
     ``libraryPath`` set to the directory containing the script file.
     """
     new_page = WebPage(self)
     self.m_pages.append(new_page)
     new_page.applySettings(self.m_defaultPageSettings)
     new_page.setNetworkAccessManager(self.m_netAccessMan)
     script_path = os.path.abspath(self.m_scriptFile)
     new_page.libraryPath = os.path.dirname(script_path)
     return new_page
Beispiel #5
0
 def getlinks(self, url, html):
     """Parse ``html`` for ``url`` and return the ``<a>`` links that match.

     A ``WebPage`` is built and stored on ``self.webpage``, its links are
     parsed, and the result of ``filter_links`` restricted to ``a`` tags and
     the patterns from ``get_patterns_from_rules(url)`` is returned.
     """
     self.webpage = page = WebPage(url, html)
     page.parse_links()
     url_patterns = self.get_patterns_from_rules(url)
     return page.filter_links(tags=['a'], patterns=url_patterns)
Beispiel #6
0
def Crawl(address, recursions=2, txt=""):
    '''
    Follow up to 5 external links found on ``address``, recursively.

    The page at ``address`` is always loaded and its external links fetched.
    While ``recursions`` is positive, each link that is not yet a key of the
    module-level ``words_list`` is crawled with one level less of recursion,
    and the top 5 words are stored in ``words_list[link]``. Links whose
    processing raises ``TooFewLinksOnPage``, ``BannedExtensionPage`` or
    ``ConnectionError`` are skipped and do not count towards the limit.

    ``txt`` is only forwarded to the recursive calls.
    '''
    page = WebPage(address)
    candidates = page.get_links(only_external_links_allowed=True)

    if recursions <= 0:
        return

    remaining_depth = recursions - 1
    link_limit = 5
    accepted = 0

    for link in candidates:
        if accepted >= link_limit:
            break
        if link in words_list:
            continue
        try:
            Crawl(link, remaining_depth, txt)
            # NOTE(review): the text comes from the page at ``address``, not
            # from ``link`` -- confirm this is intended.
            page_text = page.get_all_text()
            words_list[link] = GetWords(page_text).top_words(5)
            accepted += 1
        except (TooFewLinksOnPage, BannedExtensionPage, ConnectionError):
            continue
Beispiel #7
0
 def start(self):
     """Run the crawl loop until the URL store reports no more work.

     Each pass pops a URL from ``self.dbop``; ``None`` ends the loop. The
     URL is downloaded, and when HTML came back the page is stored through
     ``self.dbop.html2db`` (with extracted article fields when ``extract()``
     returns more than five items), its links are parsed, filtered to ``a``
     tags matching the rule patterns for the URL, and passed to
     ``self.add_seeds``. ``self.mysleep(3)`` is called after each URL.

     Any exception raised while handling a URL is printed and
     ``self.dbop.close()`` is called; the loop then continues.

     NOTE(review): the loop keeps running after ``dbop.close()`` -- confirm
     that ``pop_url()`` is expected to work after a close.
     """
     while 1:
         # Reset per pass so the error report below never reads an unbound
         # (or stale) name when pop_url() itself fails.
         url = None
         try:
             url = self.dbop.pop_url()
             print("url: %s" % url)
             if url is None:
                 print("crawling task is done.")
                 break
             error_msg, url, redirected_url, html = self.downloader.download(url)
             if html is not None:
                 self.webpage = WebPage(url, html)
                 article = self.webpage.extract()
                 if len(article) > 5:
                     # Items 1 and 2 are joined into the "addtime" value.
                     addtime = "%s %s" % (article[1], article[2])
                     self.dbop.html2db(url, html,
                                       article[0],
                                       addtime,
                                       article[3],
                                       article[5])
                 else:
                     self.dbop.html2db(url, html)
                 print(self.webpage.parse_links())
                 ruptn = self.get_patterns_from_rules(url)
                 links = self.webpage.filter_links(tags=['a'],
                                                   str_patterns=ruptn)
                 self.add_seeds(links)
             self.mysleep(3)
         except Exception as err:
             print("!!error!! Exception happend! %s %s" % (url, err))
             self.dbop.close()
Beispiel #8
0
    def __init__(self, args, parent=None):
        """Create the scripting host, configure its main page and bootstrap it.

        Args:
            args: Parsed options. Reads ``verbose``, ``script``,
                ``script_args``, ``proxy``, ``disk_cache``,
                ``ignore_ssl_errors``, ``load_images``, ``load_plugins`` and
                ``local_access_remote``.
            parent: Optional parent passed to ``QObject.__init__``.

        Exits the process with ``'Can not bootstrap!'`` when the
        ``:/bootstrap.js`` resource cannot be opened or is empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_scriptFile = args.script
        self.m_args = args.script_args

        self.m_filesystem = FileSystem(self)
        self.m_pages.append(page)

        do_action('PhantomInitPre')

        # An explicit (host, port) proxy wins; otherwise use the system one.
        if args.proxy:
            host, port = args.proxy[0], int(args.proxy[1])
            QNetworkProxy.setApplicationProxy(
                QNetworkProxy(QNetworkProxy.HttpProxy, host, port))
        else:
            QNetworkProxyFactory.setUseSystemConfiguration(True)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache,
                                                   args.ignore_ssl_errors,
                                                   self)
        page.setNetworkAccessManager(self.m_netAccessMan)

        page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        # Defaults that are also applied to pages created later.
        self.m_defaultPageSettings.update({
            'loadImages': args.load_images,
            'loadPlugins': args.load_plugins,
            'userAgent': page.userAgent(),
            'localAccessRemote': args.local_access_remote,
        })
        page.applySettings(self.m_defaultPageSettings)

        script_path = os.path.abspath(self.m_scriptFile)
        self.libraryPath = os.path.dirname(script_path)

        # inject our properties and slots into javascript
        frame = page.mainFrame()
        frame.addToJavaScriptWindowObject('phantom', self)
        frame.addToJavaScriptWindowObject('fs', self.m_filesystem)

        # Read the bootstrap script from the Qt resource system and run it.
        resource = QFile(':/bootstrap.js')
        if not resource.open(QFile.ReadOnly):
            sys.exit('Can not bootstrap!')
        script_source = str(resource.readAll())
        resource.close()
        if not script_source:
            sys.exit('Can not bootstrap!')
        frame.evaluateJavaScript(script_source)

        do_action('PhantomInitPost')
    def __init__(self):
        """Build the window: one ``QWebView`` with a ``WebPage``, no margin."""
        super(Window, self).__init__()

        self.view = browser = QWebView(self)
        browser.setPage(WebPage())

        # The view fills the window edge to edge.
        box = QVBoxLayout(self)
        box.setMargin(0)
        box.addWidget(browser)
Beispiel #10
0
 def __init__(self, settings):
     """Assemble cookie jar, network manager, page and view.

     Args:
         settings: Stored on ``self._settings`` and passed to both the
             ``WebPage`` and the ``WebView`` constructors.
     """
     super(Browser, self).__init__()
     self._settings = settings

     # Networking: the access manager is built around the cookie jar.
     self._cookie_jar = jar = CookieJar()
     self._network_access_manager = manager = NetworkAccessManager(jar)

     # Page and view: the page uses the manager, the view shows the page.
     self._web_page = page = WebPage(settings)
     page.setNetworkAccessManager(manager)
     self._web_view = view = WebView(settings)
     view.setPage(page)
Beispiel #11
0
 def createTab(self, makeCurrent: bool = True) -> WebView:
     """Add a new "(Untitled)" tab holding a fresh ``WebView`` and return it.

     Args:
         makeCurrent: When true, the new view becomes the current widget.

     Returns:
         The newly created view, with a ``WebPage`` on the default profile.
     """
     view = WebView()
     page = WebPage(QWebEngineProfile.defaultProfile(), view)
     view.setPage(page)
     self.setupView(view)
     self.addTab(view, self.tr("(Untitled)"))
     if makeCurrent:
         self.setCurrentWidget(view)
     return view
Beispiel #12
0
 def test_file_links(self):
     """Serve ``FILE_LINKS_HTML`` and check the per-category item counts.

     Expects two anchors and two files, and no js, stylesheets, links or
     images.
     """
     self.start_server(TestWebPage.FILE_LINKS_HTML)
     webpage = WebPage(TestWebPage.SERVER)

     # (accessor name, expected number of items), checked in this order.
     expected_counts = (
         ("get_js", 0),
         ("get_stylesheets", 0),
         ("get_links", 0),
         ("get_anchors", 2),
         ("get_images", 0),
         ("get_files", 2),
     )
     for accessor, expected in expected_counts:
         self.assertEqual(expected, len(getattr(webpage, accessor)()))
Beispiel #13
0
    def updateView(self):
        """Install a fresh page on the web view and load the generated HTML.

        The page delegates all links, exposes this object to JavaScript as
        ``qtWindow``, and the HTML from ``generateHtml()`` is loaded with the
        ``static/`` directory under ``self.dataDir`` as base URL.
        """
        web_view = self.ui.webView

        fresh_page = WebPage(logger=None, parent=self)
        fresh_page.setLinkDelegationPolicy(QWebPage.DelegateAllLinks)
        fresh_page.mainFrame().addToJavaScriptWindowObject("qtWindow", self)
        web_view.setPage(fresh_page)

        markup = self.generateHtml()
        # The base URL needs its trailing '/' -- without it QWebView cannot
        # resolve files relative to that directory.
        static_dir = os.path.join(self.dataDir, "static/")
        web_view.setHtml(markup, QUrl.fromLocalFile(static_dir))
Beispiel #14
0
    def get_course_manager(self, semester, departments, show_html=0):
        """Fetch the course page and wrap it in a ``CourseManager``.

        Args:
            semester: Forwarded to ``get_course_page``.
            departments: Forwarded to ``get_course_page``.
            show_html: When truthy, the fetched HTML is also passed to
                ``self.show_html``.

        Returns:
            A ``CourseManager`` built from a ``WebPage`` that reads the HTML
            written to ``temp.html`` in the current working directory.
        """
        page_source = self.get_course_page(semester, departments)
        if show_html:
            self.show_html(page_source)

        # WebPage is given a path, so the HTML goes through a file on disk.
        dump_path = "temp.html"
        with open(dump_path, "w") as dump:
            dump.write(page_source)

        parsed_page = WebPage(htmlpath=dump_path)
        return CourseManager(parsed_page, logger=self.logger)
Beispiel #15
0
    def __init__(self, parent, args):
        """Create the scripting host, configure its main page and bootstrap it.

        Args:
            parent: Parent passed to the base-class constructor.
            args: Parsed options. Reads ``verbose``, ``script``,
                ``script_args``, ``script_encoding``, ``output_encoding``,
                ``proxy``, ``proxy_type``, ``load_images``, ``load_plugins``
                and ``local_to_remote_url_access``; also stored as
                ``self.app_args``.
        """
        super(Phantom, self).__init__(parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_servers = []
        self.m_verbose = args.verbose
        self.m_page = page = WebPage(self, args)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.app_args = args
        self.m_scriptFile = args.script
        self.m_args = args.script_args
        self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
        self.m_outputEncoding = Encode(args.output_encoding,
                                       sys.stdout.encoding_sys)

        self.m_pages.append(page)

        do_action('PhantomInitPre')

        # An explicit (host, port) proxy wins; otherwise use the system one.
        if args.proxy is not None:
            host, port = args.proxy[0], int(args.proxy[1])
            QNetworkProxy.setApplicationProxy(
                QNetworkProxy(args.proxy_type, host, port))
        else:
            QNetworkProxyFactory.setUseSystemConfiguration(True)

        page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        # Defaults that are also applied to pages created later.
        self.m_defaultPageSettings.update({
            'loadImages': args.load_images,
            'loadPlugins': args.load_plugins,
            'javascriptEnabled': True,
            'XSSAuditingEnabled': False,
            'userAgent': page.userAgent(),
            'localToRemoteUrlAccessEnabled': args.local_to_remote_url_access,
        })
        page.applySettings(self.m_defaultPageSettings)

        script_path = os.path.abspath(self.m_scriptFile)
        self.libraryPath = os.path.dirname(script_path)

        # inject our properties and slots into javascript
        frame = page.mainFrame()
        frame.addToJavaScriptWindowObject('phantom', self)

        # Run the bootstrap script shipped in the Qt resource system.
        with QPyFile(':/bootstrap.js') as f:
            frame.evaluateJavaScript(f.readAll())

        do_action('PhantomInitPost')
Beispiel #16
0
def process_web_page(thread_id, sites, q):
    """Collect the allowed external links of ``sites`` and put them on ``q``.

    Args:
        thread_id: Not used by this function.
        sites: Iterable of sites; each is wrapped in ``WebPage`` and
            ``WebPageLinkExtractor``.
        q: Queue-like object; receives exactly one ``put`` with the list of
            collected links (possibly empty).

    External links whose ``qdn`` matches ``FORBIDDEN_DOMAINS`` or whose
    ``url`` matches ``FORBIDDEN_FILETYPES`` are dropped. A site that fails
    is skipped; links gathered from it before the failure are kept.
    """
    webpages = []

    for site in sites:
        try:
            extractor = WebPageLinkExtractor(WebPage(site))
            # Discard invalid webpages
            for web in extractor.external_links:
                if (FORBIDDEN_DOMAINS.match(web.qdn)
                        or FORBIDDEN_FILETYPES.match(web.url)):
                    continue
                webpages.append(web)
            time.sleep(0.1)
        except Exception:
            # Best effort: one bad site must not abort the batch. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit still stop the worker.
            continue

    q.put(webpages)
Beispiel #17
0
 def run(self):
     """Fetch URLs from the spider's task list while ``self.status`` is true.

     Each URL taken from ``self.spider.task_list`` (1 second timeout; an
     empty queue just retries) is checked with ``check_robots``. Allowed
     URLs are fetched; on success the page data is saved and every link not
     in ``visited_list`` is added to ``extend_list``, on failure a message
     is printed.

     ``increase_running()`` is always balanced by ``decrease_running()``,
     including when robots rules forbid the URL or processing raises.
     """
     while self.status:
         try:
             url = self.spider.task_list.get(timeout=1)
         except Empty:
             # log.info('%s: task_list Empty' % self.name)
             continue
         self.spider.increase_running()
         try:
             if not self.spider.check_robots(url):
                 log.info('%s - robots forbidden : %s' % (self.name, url))
                 continue
             page = WebPage(url)
             # print('%s prepare to fetch %s' % (self.name, url))
             if page.fetch():
                 self.spider.db.save_data(page.get_data())
                 for link in page.get_link():  # retrieve links from html
                     if link not in self.spider.visited_list:  # not visited yet
                         self.spider.extend_list.add(link)
             else:
                 print('%s: Page fetch failed: %s' % (self.name, page.url))
         finally:
             # Runs on every exit path, so the running counter cannot leak.
             self.spider.decrease_running()
Beispiel #18
0
    def get_all_urls(self):
        """Collect the building URLs found on every index page.

        Each anchor with ``se:clickable:target="true"`` contributes its
        ``href`` prefixed with ``https://streeteasy.com`` and increments
        ``IndexPage.url_count``. Progress and totals are printed.

        Returns:
            dict: {"page 1": [url1, url2, url3 ...], "page 2": [...], ...}
        """
        urls_by_page = {}
        page_count = 0  # stays 0 when there are no index pages at all
        for page_count, page_url in enumerate(self.get_all_page_urls(),
                                              start=1):
            soup = WebPage(page_url).get_soup()
            anchors = soup.findAll('a', {"se:clickable:target": "true"})

            page_links = []
            for anchor in anchors:
                page_links.append("https://streeteasy.com" + anchor.get("href"))
                IndexPage.url_count += 1

            urls_by_page[f"page {page_count}"] = page_links
            print(f"Retrived all url on page {page_count}")

        print("=============================")
        print(f"RETRIVED ALL URLS FOR {page_count} PAGES")
        print("TOTAL URLS IDENTIFIED: ", IndexPage.url_count)
        return urls_by_page
Beispiel #19
0
    def __init__(self, args, parent=None):
        """Create the scripting host and configure its ``WebPage``.

        Args:
            args: Parsed options. Reads ``verbose``, ``script`` (an open
                file object that is read and then closed), ``script_args``,
                ``upload_file``, ``load_images``, ``load_plugins``,
                ``proxy``, ``disk_cache`` and ``ignore_ssl_errors``.
            parent: Optional parent passed to ``QObject.__init__``.

        Note:
            The ``PhantomInitPre`` and ``PhantomInitPost`` hooks receive
            ``Bunch(locals())``, so the local variable names used here are
            visible to hook handlers and are deliberately left unchanged.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_loadStatus = self.m_state = ''
        # One dict per attribute: a chained ``a = b = c = {}`` binds all the
        # names to a single shared dict, so writing through one attribute
        # would silently change the other two.
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # setup the values from args
        self.m_script = args.script.read()
        self.m_scriptFile = args.script.name
        self.m_scriptDir = os.path.dirname(args.script.name) + '/'
        self.m_args = args.script_args
        self.m_upload_file = args.upload_file
        # Images load unless explicitly 'no'; plugins only when 'yes'.
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script.close()

        do_action('PhantomInitPre', Bunch(locals()))

        # Transparent base brush for the page palette.
        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                                  int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages,
                                            autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled,
                                            pluginsEnabled)
        self.m_page.settings().setAttribute(
            QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(
            QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled,
                                            True)
        # Local and offline storage both use the desktop data location.
        self.m_page.settings().setLocalStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal,
                                                   Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical,
                                                   Qt.ScrollBarAlwaysOff)

        m_netAccessMan = NetworkAccessManager(args.disk_cache,
                                              args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(
            self.inject)
        self.m_page.loadFinished.connect(self.finish)

        do_action('PhantomInitPost', Bunch(locals()))
Beispiel #20
0
            error_msg = "lxml error"
        return error_msg, url, redirected_url, html
    """
        
if __name__ == "__main__":
    #'''
    url = "http://www.cnbeta.com/"
    downloader = DownloadManager()
    error_msg, url, redirected_url, html = downloader.download(url)
    print "error_msg=%s" %error_msg
    print "url=%s" %url
    print "redirected_url=%s" %redirected_url
    f = open("www.cnbeta.com.html",'w')
    f.write(html)
    f.close()
    webpage = WebPage(url, html)
    webpage.parse_links()

    website = 'cnbeta\.com'
    patnstr = '^(http|https)://(.*\.' + website + ')(.+)$';
    links = webpage.filter_links(tags=['a'], str_patterns=[patnstr])
    links.sort()

    f_filter_links = open('filter_links_cnbeta.txt', 'w')

    #print links
    f = open('links_regged_cnbeta.txt','w')
    for link in links:
        f_filter_links.write('%s\n' % link)
        f.write('%s\n' % link)
        for elem, attr, lnk, pos in webpage.doc.iterlinks():
        print("Usage: {} <source_url> [max num of internal_links]".format(
            sys.argv[0]))
        sys.exit(0)

    source_url = sys.argv[1]
    max_sites = float('inf')
    if len(sys.argv) == 3:
        max_sites = int(sys.argv[2])

    to_process = [source_url]
    visited = set()
    internal_links = []

    while len(to_process):
        url = to_process.pop()
        w = WebPageLinkExtractor(WebPage(url))

        if len(visited) >= max_sites:
            break

        for web in w.internal_links:
            index = web.url.find('#')
            if index != -1:
                web.url = web.url[:index]
            if web.url not in visited:
                visited.add(web.url)
                to_process.append(web.url)

        time.sleep(0.3)

    for link in visited:
Beispiel #22
0
    def __init__(self, args, parent=None):
        """Create the scripting host and configure its ``WebPage``.

        Args:
            args: Parsed options. Reads ``verbose``, ``script`` (a sequence
                whose first item is an open file object, read and then
                closed; the remaining items become ``self.m_args``),
                ``upload_file``, ``load_images``, ``load_plugins`` and
                ``proxy``.
            parent: Optional parent passed to ``QObject.__init__``.

        Side effects:
            Changes the process working directory to the script's directory
            when the script path has a directory component.
        """
        QObject.__init__(self, parent)

        # variable declarations
        # Separate objects per attribute: a chained ``a = b = X()`` binds
        # every name to the same instance, so an in-place change made
        # through one attribute would show up in the others.
        self.m_loadStatus = QString()
        self.m_state = QString()
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # setup the values from args
        self.m_script = QString.fromUtf8(args.script[0].read())
        self.m_scriptFile = args.script[0].name
        self.m_args = args.script[1:]
        self.m_upload_file = args.upload_file
        # Images load unless explicitly 'no'; plugins only when 'yes'.
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script[0].close()

        # Transparent base brush for the page palette.
        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                                  int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages,
                                            autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled,
                                            pluginsEnabled)
        self.m_page.settings().setAttribute(
            QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(
            QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled,
                                            True)
        # Local and offline storage both use the desktop data location.
        self.m_page.settings().setLocalStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal,
                                                   Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical,
                                                   Qt.ScrollBarAlwaysOff)

        # if our script was called in a different directory, change to it
        # to make any dealings with files be relative to the scripts directory
        if os.path.dirname(self.m_scriptFile):
            os.chdir(os.path.dirname(self.m_scriptFile))

        # The custom network access manager is only installed when verbose.
        if self.m_verbose:
            m_netAccessMan = NetworkAccessManager(self)
            self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.connect(self.m_page.mainFrame(),
                     SIGNAL('javaScriptWindowObjectCleared()'), self.inject)
        self.connect(self.m_page, SIGNAL('loadFinished(bool)'), self.finish)
Beispiel #23
0
 def get(self, url):
     """Request ``self.baseUrl + url`` and wrap the response text in a WebPage.

     Args:
         url: Suffix appended to ``self.baseUrl`` to form the address.

     Returns:
         A ``WebPage`` built from the text of the response.
     """
     address = self.baseUrl + url
     response = requests.get(address)
     return WebPage(response.text)
Beispiel #24
0
 def createWebPage(self):
     """Create a ``WebPage`` for this object and the stored args; return it.

     The new page is appended to ``self.m_pages``, receives the default page
     settings, and gets its ``libraryPath`` set to the directory containing
     the script file.
     """
     new_page = WebPage(self, self.app_args)
     self.m_pages.append(new_page)
     new_page.applySettings(self.m_defaultPageSettings)
     script_path = os.path.abspath(self.m_scriptFile)
     new_page.libraryPath = os.path.dirname(script_path)
     return new_page