def set_data(self):
        """Build the form payload for the class-search request.

        Reads the semester <option> elements found in ``self.htmldata1`` into
        ``self.semesters`` (option text -> option value), picks the first
        entry whose text contains both ``self.season`` (case-insensitive) and
        ``self.year``, and stores the request fields in ``self.data``.

        If no entry matches, a warning is printed and the term code sent in
        the payload is the string "0".
        """
        w=WebPage(htmldata=self.htmldata1)

        #find all the codes for season/year in the first html form data
        self.semesters={}
        xpath="""//*[@id="CLASS_SRCH_WRK2_STRM$35$"]/option"""
        for e in w.get_from_xpath(xpath):
            # An <option> may have no text or no "value" attribute; treat a
            # missing one as empty instead of calling .strip() on None.
            key=e.text or ""
            semester=e.get("value") or ""
            if key.strip() and semester.strip():
                self.semesters[key]=semester

        #match up season/year to the codes we just found, if possible
        code=0
        for key,semester in self.semesters.items():
            if self.season.lower() in key.lower() and str(self.year) in key:
                code=semester
                break

        if not code:
            print_color("Warning: failed to find season/year in search options. season='%s' year='%s'"%(self.season,self.year),COLORS.RED)
            print_d("search options",self.semesters)

        self.data={"ICFocus":"SSR_CLSRCH_WRK_ACAD_CAREER$2",
                "CLASS_SRCH_WRK2_STRM$35$":str(code),
                "SSR_CLSRCH_WRK_SUBJECT$0":self.department,
                "ICAction":"CLASS_SRCH_WRK2_SSR_PB_CLASS_SRCH",
                "SSR_CLSRCH_WRK_ACAD_CAREER$2":self.level}
Exemple #2
0
    def crawl(self):
        """Fetch queued URLs in FIFO order and enqueue new same-domain links.

        Runs until the queue is empty or ``self.discovered`` holds more than
        ``self.MAX_LINKS_TO_VISIT`` entries. For every fetched page the asset
        list is appended to ``self.assets``. When the DEBUG environment
        variable is set, progress is printed.
        """
        while self.url_queue and len(
                self.discovered) <= self.MAX_LINKS_TO_VISIT:
            url = self.url_queue.popleft()

            if 'DEBUG' in os.environ:
                # Single pre-formatted argument: prints the same text on
                # Python 2 and Python 3.
                print("Queue Size: %s" % (len(self.url_queue),))
                print("Fetching:  %s" % (url,))

            webpage = WebPage(url)
            self.unvisited[url] = False

            # False: don't keep fragments
            all_links = webpage.get_anchors(False)
            all_assets = webpage.get_assets()
            self.assets.append({'url': url, 'assets': all_assets})

            for link in all_links:
                # only links that belong to the same domain are considered
                if not self.same_domain_rule.matches(link):
                    continue

                link_url = link.geturl()
                # NOTE(review): indexing an unseen key must yield None here,
                # so these mappings are presumably defaultdict-like -- confirm.
                if self.discovered[link_url] is not None:
                    continue
                self.discovered[link_url] = True

                # process if not already in the queue
                if self.unvisited[link_url] is None:
                    self.url_queue.append(link_url)
                    self.unvisited[link_url] = True
Exemple #3
0
def Crawl(address, recursions=2, txt=""):
    '''
    Follow up to 5 external links of a page, recursing into each of them.

    The page at ``address`` is always loaded and its external links are
    collected, even when ``recursions`` is 0. While ``recursions`` is
    positive, each link that is not yet a key of the module-level
    ``words_list`` is crawled with ``recursions - 1``; on success the top 5
    words are stored in ``words_list[link]``. Links are taken in the order
    returned by ``get_links`` (no random sampling is done here).

    ``txt`` is only passed through to the recursive calls.
    Links whose crawl raises TooFewLinksOnPage, BannedExtensionPage or
    ConnectionError are skipped and do not count towards the limit of 5.
    '''
    links_amount = 0
    max_links_amount = 5
    web_page = WebPage(address)
    links = web_page.get_links(only_external_links_allowed=True)
    #print("Links on page: {} found: {}".format(address, len(links)))

    if recursions <= 0:
        return

    recursions -= 1
    for link in links:
        if links_amount >= max_links_amount:
            break
        if link in words_list:
            continue
        try:
            Crawl(link, recursions, txt)
            # NOTE(review): the text comes from the page at `address`, yet
            # it is stored under the child `link` -- confirm this is intended.
            all_text = web_page.get_all_text()
            words_parser = GetWords(all_text)
            words_list[link] = words_parser.top_words(5)
            links_amount += 1
        except (TooFewLinksOnPage, BannedExtensionPage,
                ConnectionError):
            #print("Getting page from previous round...")
            continue
Exemple #4
0
 def getlinks(self, url, html):
     """Parse ``html`` for ``url`` and return the <a> links allowed by the rules.

     The parsed page is kept on ``self.webpage``.
     """
     page = WebPage(url, html)
     self.webpage = page
     page.parse_links()
     allowed_patterns = self.get_patterns_from_rules(url)
     return page.filter_links(tags=['a'], patterns=allowed_patterns)
Exemple #5
0
 def start(self):
     """Run the crawl loop until ``self.dbop.pop_url()`` returns None.

     Each URL is downloaded; when HTML comes back it is stored through
     ``self.dbop.html2db`` (with article fields when extraction yields more
     than 5 items) and its rule-matched <a> links are added as new seeds.
     Any exception in an iteration is reported, ``self.dbop.close()`` is
     called, and the loop goes on to the next iteration.
     """
     # Bound before the loop so the error report below can always format
     # it, even when the very first pop_url() call raises.
     url = None
     while True:
         try:
             url = self.dbop.pop_url()
             print("url: %s" % url)
             if url is None:
                 print("crawling task is done.")
                 break
             error_msg, url, redirected_url, html = self.downloader.download(url)
             #print error_msg, url, redirected_url, html
             if html is not None:
                 self.webpage = WebPage(url, html)
                 article = self.webpage.extract()
                 if len(article) > 5:
                     addtime = "%s %s" % (article[1], article[2])
                     self.dbop.html2db(url, html,
                                       article[0],
                                       addtime,
                                       article[3],
                                       article[5])
                 else:
                     self.dbop.html2db(url, html)
                 print(self.webpage.parse_links())
                 ruptn = self.get_patterns_from_rules(url)
                 links = self.webpage.filter_links(tags=['a'],
                                                   str_patterns=ruptn)
                 self.add_seeds(links)
             self.mysleep(3)
         except Exception as err:
             print("!!error!! Exception happend! %s %s" % (url, err))
             # NOTE(review): the loop continues after closing dbop --
             # confirm pop_url() is usable after close().
             self.dbop.close()
Exemple #6
0
    def __init__(self, args, parent=None):
        """Create the main WebPage and configure it from parsed arguments.

        Reads from ``args``: verbose, script, script_args, proxy, disk_cache,
        ignore_ssl_errors, load_images, load_plugins and local_access_remote.
        Calls ``sys.exit`` if the ':/bootstrap.js' resource cannot be opened
        or is empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_scriptFile = args.script
        self.m_args = args.script_args

        self.m_filesystem = FileSystem(self)
        self.m_pages.append(self.m_page)

        # hook fired before any proxy/network/page configuration is applied
        do_action('PhantomInitPre')

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as [0] = host, [1] = port
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                                  int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache,
                                                   args.ignore_ssl_errors,
                                                   self)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        self.m_page.javaScriptConsoleMessageSent.connect(
            self.printConsoleMessage)

        # defaults collected here are applied to the main page below
        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings[
            'localAccessRemote'] = args.local_access_remote
        self.m_page.applySettings(self.m_defaultPageSettings)

        # directory that contains the script file
        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)
        self.m_page.mainFrame().addToJavaScriptWindowObject(
            'fs', self.m_filesystem)

        # load the bootstrap script from the Qt resource system and run it
        # in the main frame; abort if it is missing or empty
        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            sys.exit('Can not bootstrap!')
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        if not bootstrapper:
            sys.exit('Can not bootstrap!')
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        # hook fired once initialisation is complete
        do_action('PhantomInitPost')
Exemple #7
0
class Crawler():
    """Queue-driven crawler that stores pages and follows rule-matched links.

    URLs are popped from ``self.queue``; downloaded HTML is stored in
    ``self.webpagedb``; <a> links matching the configured rules are
    de-duplicated through ``self.duplcheck`` and pushed back on the queue.
    """

    def __init__(self ):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        """Open the queue, page and duplicate-check databases."""
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        """Queue the links that the duplicate checker has not seen yet."""
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        """Replace the rule set.

        ``rules`` maps a page-URL regex string to an iterable of link regex
        strings; everything is compiled with ``re.compile``.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self,url):
        """Return the de-duplicated link patterns of every rule matching ``url``.

        Page rules are matched with ``match`` (anchored at the start of the
        URL). The order of the returned list is not defined.
        """
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Crawl until the queue returns None, sleeping 3 seconds per URL."""
        while True:
            url = self.queue.pop_url()
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url,html)

                self.webpage = WebPage(url,html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print(ruptn)
                links = self.webpage.filter_links(tags = ['a'], patterns= ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        """Sleep ``n`` seconds, printing a progress line after each second."""
        for i in range(n):
            time.sleep(1)
            # Pre-formatted so the output is the same on Python 2 and 3.
            print("sleep %s of %s" % (i, n))
Exemple #8
0
 def __init__(self, settings):
     """Create the cookie jar, network manager, page and view for ``settings``.

     The page uses the network access manager built around the cookie jar,
     and the view displays that page.
     """
     super(Browser, self).__init__()
     self._settings = settings

     jar = CookieJar()
     manager = NetworkAccessManager(jar)
     page = WebPage(settings)
     page.setNetworkAccessManager(manager)
     view = WebView(settings)
     view.setPage(page)

     self._cookie_jar = jar
     self._network_access_manager = manager
     self._web_page = page
     self._web_view = view
Exemple #9
0
class Crawler():
    """Crawler that pulls URLs from an OperatorDB and stores extracted pages.

    Link-following is controlled by rules mapping a page-URL regex to the
    link regexes allowed on matching pages.
    """

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.rules = {}
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        """Hand ``links`` to the database layer as new seeds."""
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        """Replace the rule set.

        ``rules`` maps a page-URL regex string to an iterable of link regex
        strings; everything is compiled with ``re.compile``.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of every rule matching ``url``.

        Page rules are matched with ``match`` (anchored at the start of the
        URL). The order of the returned list is not defined.
        """
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Run the crawl loop until ``self.dbop.pop_url()`` returns None.

        Any exception in an iteration is reported, ``self.dbop.close()`` is
        called, and the loop goes on to the next iteration.
        """
        # Bound before the loop so the error report below can always format
        # it, even when the very first pop_url() call raises.
        url = None
        while True:
            try:
                url = self.dbop.pop_url()
                print("url: %s" % url)
                if url is None:
                    print("crawling task is done.")
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                #print error_msg, url, redirected_url, html
                if html is not None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html,
                                          article[0],
                                          addtime,
                                          article[3],
                                          article[5])
                    else:
                        self.dbop.html2db(url, html)
                    print(self.webpage.parse_links())
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'],
                                                      str_patterns=ruptn)
                    self.add_seeds(links)
                # NOTE(review): mysleep is not defined in the visible part of
                # this class -- confirm it is provided elsewhere.
                self.mysleep(3)
            except Exception as err:
                print("!!error!! Exception happend! %s %s" % (url, err))
                # NOTE(review): the loop continues after closing dbop --
                # confirm pop_url() is usable after close().
                self.dbop.close()
Exemple #10
0
    def updateView(self):
        """Install a fresh WebPage on the web view and render the generated HTML.

        All link clicks are delegated, and this object is exposed to the
        page's JavaScript as ``qtWindow``.
        """
        web_view = self.ui.webView

        fresh_page = WebPage(logger=None, parent=self)
        fresh_page.setLinkDelegationPolicy(QWebPage.DelegateAllLinks)
        fresh_page.mainFrame().addToJavaScriptWindowObject("qtWindow", self)
        web_view.setPage(fresh_page)

        markup = self.generateHtml()
        # The base URL needs its trailing '/', otherwise QWebView cannot
        # resolve files relative to it.
        static_dir = os.path.join(self.dataDir, "static/")
        web_view.setHtml(markup, QUrl.fromLocalFile(static_dir))
Exemple #11
0
    def __init__(self, parent, args):
        """Create the main WebPage and configure it from parsed arguments.

        Reads from ``args``: verbose, script, script_args, script_encoding,
        output_encoding, proxy, proxy_type, load_images, load_plugins and
        local_to_remote_url_access. The whole ``args`` object is also kept
        on ``self.app_args``.
        """
        super(Phantom, self).__init__(parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_servers = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self, args)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.app_args = args
        self.m_scriptFile = args.script
        self.m_args = args.script_args
        # second argument is presumably the fallback encoding -- TODO confirm
        self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
        self.m_outputEncoding = Encode(args.output_encoding,
                                       sys.stdout.encoding_sys)

        self.m_pages.append(self.m_page)

        # hook fired before any proxy/page configuration is applied
        do_action('PhantomInitPre')

        if args.proxy is None:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as [0] = host, [1] = port
            proxy = QNetworkProxy(args.proxy_type, args.proxy[0],
                                  int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.javaScriptConsoleMessageSent.connect(
            self.printConsoleMessage)

        # defaults collected here are applied to the main page below
        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['javascriptEnabled'] = True
        self.m_defaultPageSettings['XSSAuditingEnabled'] = False
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings[
            'localToRemoteUrlAccessEnabled'] = args.local_to_remote_url_access
        self.m_page.applySettings(self.m_defaultPageSettings)

        # directory that contains the script file
        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        # run the bootstrap script from the Qt resource system in the main frame
        with QPyFile(':/bootstrap.js') as f:
            self.m_page.mainFrame().evaluateJavaScript(f.readAll())

        # hook fired once initialisation is complete
        do_action('PhantomInitPost')
Exemple #12
0
    def __init__(self, profile: QWebEngineProfile):
        """Build a window with a read-only address bar above a web view.

        The view gets a WebPage created for ``profile``; title, URL, icon,
        geometry and close requests coming from the view/page are forwarded
        to this widget's handlers.
        """
        super().__init__()

        address_bar = UrlLineEdit(self)
        view = WebView(self)
        self.m_addressBar = address_bar
        self.m_view = view
        self.setAttribute(Qt.WA_DeleteOnClose)
        self.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Minimum)

        # Stack the address bar on top of the view.
        box = QVBoxLayout()
        self.setLayout(box)
        for child in (address_bar, view):
            box.addWidget(child)

        view.setPage(WebPage(profile, view))
        view.setFocus()
        address_bar.setReadOnly(True)
        address_bar.setFavIcon(QIcon(":defaulticon.png"))

        view.titleChanged.connect(self.setWindowTitle)
        view.urlChanged.connect(self.setUrl)

        page = view.page()
        page.iconChanged.connect(self.handleIconChanged)
        page.geometryChangeRequested.connect(
            self.handleGeometryChangeRequested)
        page.windowCloseRequested.connect(self.close)
Exemple #13
0
 def __init__(self):
     """Set up the Qt application, network stack, page and view.

     Reuses the running QApplication when there is one, otherwise creates
     it. Images are not auto-loaded; plugins and JavaScript are enabled.
     """
     logging.debug("-->")
     super(WebBrowser, self).__init__()
     # reuse an existing QApplication; only a newly created one is told to
     # keep running after its last window closes
     self.app = QApplication.instance()
     if self.app is None:
         self.app = QApplication(sys.argv)
         self.app.setQuitOnLastWindowClosed(False)
     self.event_loop = QEventLoop()
     self.cookie_jar = CookieJar()
     # proxy object is created but not installed (see the commented-out
     # setProxy call below)
     self.proxy = QNetworkProxy(QNetworkProxy.HttpProxy, "127.0.1.1", 8888)
     self.network_manager = NetworkAccessManager() 
     self.network_manager.setCookieJar(self.cookie_jar)
     # self.network_manager.setProxy(self.proxy)
     self.web_page = WebPage()        
     self.web_page.setNetworkAccessManager(self.network_manager)
     self.web_view = QWebView()
     self.web_view.setPage(self.web_page)        
     self.web_view.settings().setAttribute(QWebSettings.AutoLoadImages,False)
     self.web_view.settings().setAttribute(QWebSettings.PluginsEnabled, True)
     self.web_view.settings().setAttribute(QWebSettings.JavascriptEnabled, True)
     # self.web_view.settings().setAttribute(QWebSettings.XSSAuditingEnabled, False)
     self.web_view.settings().setAttribute(QWebSettings.LocalContentCanAccessRemoteUrls, True) 
     # old-style signal connection: every finished network reply is passed
     # to network_reply_finished
     self.connect(self.web_view.page().networkAccessManager(),SIGNAL("finished(QNetworkReply*)"),self.network_reply_finished)
     # page-load bookkeeping, all initially unset
     self.page_loaded_validator = None
     self.page_loaded_handler = None
     self.page_loaded_handler_kwargs = None
     self.timeout_message = None
     self.timer = None
     self.event_loop_exception = None
     logging.debug("<--")
Exemple #14
0
 def getlinks(self,url,html):
     """Parse ``html`` for ``url`` and return the <a> links allowed by the rules.

     The parsed page is kept on ``self.webpage``.
     """
     page = WebPage(url, html)
     self.webpage = page
     page.parse_links()
     allowed_patterns = self.get_patterns_from_rules(url)
     return page.filter_links(tags=['a'], patterns=allowed_patterns)
Exemple #15
0
 def __init__(self, parent, url=''):
     """Create the view with its own WebPage, start loading ``url`` and
     switch on the web settings used by the application.

     The view's title changes are forwarded to ``parent.setWindowTitle``.
     """
     super(WebView, self).__init__(parent)

     # drag state
     self.draging = False
     self.drag = QDrag(self)
     self.dragStartPos = None

     # page / frame
     self.webPage = WebPage()
     self.setPage(self.webPage)
     self.mainFrame = self.page().mainFrame()
     self.setAttribute(Qt.WA_DeleteOnClose, True)
     self.titleChanged.connect(parent.setWindowTitle)
     self.load(url)

     web_settings = self.settings()
     web_settings.setDefaultTextEncoding("utf-8")
     web_settings.enablePersistentStorage(assets.fs.dataPath())
     # every attribute listed here is switched on, in this order
     enabled_attributes = (
         QWebSettings.PluginsEnabled,
         QWebSettings.DnsPrefetchEnabled,
         QWebSettings.XSSAuditingEnabled,
         QWebSettings.CSSGridLayoutEnabled,
         QWebSettings.ScrollAnimatorEnabled,
         QWebSettings.DeveloperExtrasEnabled,
         QWebSettings.JavascriptCanOpenWindows,
         QWebSettings.JavascriptCanCloseWindows,
         QWebSettings.JavascriptCanAccessClipboard,
         QWebSettings.LocalContentCanAccessFileUrls,
         QWebSettings.LocalContentCanAccessRemoteUrls,
     )
     for attribute in enabled_attributes:
         web_settings.setAttribute(attribute, True)

     frame = self.mainFrame
     frame.javaScriptWindowObjectCleared.connect(self.setJavaScriptObject)
     frame.iconChanged.connect(self.changeIcon)
Exemple #16
0
    def start(self):
        """Crawl until the queue returns None, collecting matching file links.

        For each page that downloads with HTML: store it, queue the <a>
        links allowed by the URL rules, then select the <a> links matching
        ``self.file_rule``, record them in ``self.files`` and pass them to
        ``self.download_files``.
        """
        while True:
            url = self.queue.pop_url()
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is None:
                continue
            self.webpagedb.html2db(url, html)

            self.webpage = WebPage(url, html)
            self.webpage.parse_links()
            ruptn = self.get_patterns_from_rules(url)
            links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
            print(links)
            self.add_seeds(links)

            file_pattern = [re.compile(self.file_rule)]
            files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
            # NOTE(review): this appends the whole per-page list as one
            # element of self.files -- confirm extend() was not intended.
            self.files.append(files)
            #TODO:
            self.download_files(files)
            print(files)
Exemple #17
0
 def test_file_links(self):
     """A page with only file links has 2 anchors and 2 files, nothing else."""
     self.start_server(TestWebPage.FILE_LINKS_HTML)
     page = WebPage(TestWebPage.SERVER)
     # (expected count, accessor name), checked in this order
     expectations = (
         (0, 'get_js'),
         (0, 'get_stylesheets'),
         (0, 'get_links'),
         (2, 'get_anchors'),
         (0, 'get_images'),
         (2, 'get_files'),
     )
     for expected_count, accessor in expectations:
         self.assertEqual(expected_count, len(getattr(page, accessor)()))
    def __init__(self):
        """Create a window whose only content is a margin-less web view."""
        super(Window, self).__init__()

        web_view = QWebView(self)
        self.view = web_view
        web_view.setPage(WebPage())

        box = QVBoxLayout(self)
        box.setMargin(0)
        box.addWidget(web_view)
Exemple #19
0
 def createWebPage(self):
     """Create, register and return a new WebPage.

     The page receives the default page settings and the shared network
     access manager; its ``libraryPath`` is the script file's directory.
     """
     new_page = WebPage(self)
     self.m_pages.append(new_page)
     new_page.applySettings(self.m_defaultPageSettings)
     new_page.setNetworkAccessManager(self.m_netAccessMan)
     script_path = os.path.abspath(self.m_scriptFile)
     new_page.libraryPath = os.path.dirname(script_path)
     return new_page
Exemple #20
0
 def createTab(self, makeCurrent: bool = True) -> WebView:
     """Add a new "(Untitled)" tab and return its WebView.

     The view gets a WebPage on the default profile. When ``makeCurrent``
     is true the new tab becomes the current widget.
     """
     view = WebView()
     view.setPage(WebPage(QWebEngineProfile.defaultProfile(), view))
     self.setupView(view)
     self.addTab(view, self.tr("(Untitled)"))
     if makeCurrent:
         self.setCurrentWidget(view)
     return view
Exemple #21
0
    def __init__(self, parent, args):
        """Create the main WebPage and configure it from parsed arguments.

        Reads from ``args``: verbose, script, script_args, proxy, disk_cache,
        cookies, ignore_ssl_errors, load_images, load_plugins and
        local_access_remote. Calls ``sys.exit`` if the ':/bootstrap.js'
        resource cannot be opened or is empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_scriptFile = args.script
        self.m_args = args.script_args

        self.m_filesystem = FileSystem(self)

        self.m_pages.append(self.m_page)

        # hook fired before any proxy/network/page configuration is applied
        do_action('PhantomInitPre')

        if args.proxy is None:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as [0] = host, [1] = port
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(self, args.disk_cache, args.cookies, args.ignore_ssl_errors)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        # defaults collected here are applied to the main page below
        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['javascriptEnabled'] = True
        self.m_defaultPageSettings['XSSAuditingEnabled'] = False
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings['localAccessRemote'] = args.local_access_remote
        self.m_page.applySettings(self.m_defaultPageSettings)

        # directory that contains the script file
        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)
        self.m_page.mainFrame().addToJavaScriptWindowObject('fs', self.m_filesystem)

        # load the bootstrap script from the Qt resource system and run it
        # in the main frame; abort if it is missing or empty
        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            sys.exit('Can not bootstrap!')
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        if not bootstrapper:
            sys.exit('Can not bootstrap!')
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        # hook fired once initialisation is complete
        do_action('PhantomInitPost')
Exemple #22
0
    def __init__(self, args, parent = None):
        """Set up the scripted page from parsed command-line arguments.

        ``args.script`` is an open file object: its content, name and
        directory are recorded and the file is closed here. The page is
        given a transparent base palette, image/plugin loading follows
        ``args.load_images`` / ``args.load_plugins``, local and offline
        storage are enabled, scroll bars are hidden, and the page signals
        are wired to ``self.inject`` and ``self.finish``. A custom network
        access manager is installed only in verbose mode.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_loadStatus = ''
        self.m_state = ''
        # One dict per attribute: a chained `a = b = c = {}` would bind all
        # three names to the same object, so a write through one of them
        # would show up in the others.
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # setup the values from args
        self.m_script = args.script.read()
        self.m_scriptFile = args.script.name
        self.m_scriptDir = os.path.dirname(args.script.name)
        # keep a trailing path separator on the script directory
        if sys.platform.startswith('win'):
            self.m_scriptDir += '\\'
        else:
            self.m_scriptDir += '/'
        self.m_args = args.script_args
        self.m_upload_file = args.upload_file
        # images load unless explicitly set to 'no'; plugins only on 'yes'
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script.close()

        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as [0] = host, [1] = port
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
        self.m_page.settings().setAttribute(QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled, True)
        self.m_page.settings().setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

        if self.m_verbose:
            m_netAccessMan = NetworkAccessManager(args.disk_cache, self)
            self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(self.inject)
        self.m_page.loadFinished.connect(self.finish)
Exemple #23
0
    def __init__(self, args, parent = None):
        """Set up the scripted page from parsed command-line arguments.

        ``args.script[0]`` is an open script file (read as UTF-8 and closed
        here); the remaining entries of ``args.script`` become the script
        arguments. The page is given a transparent base palette,
        image/plugin loading follows ``args.load_images`` /
        ``args.load_plugins``, local and offline storage are enabled and
        scroll bars are hidden. If the script path has a directory part,
        the process changes into that directory. A custom network access
        manager is installed only in verbose mode.
        """
        QObject.__init__(self, parent)

        # variable declarations
        # Separate objects per attribute: a chained `a = b = QString()` or
        # `a = b = c = {}` would bind all the names to one shared object.
        self.m_loadStatus = QString()
        self.m_state = QString()
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # setup the values from args
        self.m_script = QString.fromUtf8(args.script[0].read())
        self.m_scriptFile = args.script[0].name
        self.m_args = args.script[1:]
        self.m_upload_file = args.upload_file
        # images load unless explicitly set to 'no'; plugins only on 'yes'
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script[0].close()

        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as [0] = host, [1] = port
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
        self.m_page.settings().setAttribute(QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled, True)
        self.m_page.settings().setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

        # if our script was called in a different directory, change to it
        # to make any dealings with files be relative to the scripts directory
        if os.path.dirname(self.m_scriptFile):
            os.chdir(os.path.dirname(self.m_scriptFile))

        if self.m_verbose:
            m_netAccessMan = NetworkAccessManager(self)
            self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.connect(self.m_page.mainFrame(), SIGNAL('javaScriptWindowObjectCleared()'), self.inject)
        self.connect(self.m_page, SIGNAL('loadFinished(bool)'), self.finish)
Exemple #24
0
 def run(self):
     """Worker loop: take URLs from the spider's task list and process them.

     Runs while ``self.status`` is truthy. Waits up to one second for a
     URL; URLs rejected by ``check_robots`` are logged and skipped. For a
     fetched page the data is saved and every link not yet in the spider's
     ``visited_list`` is added to its ``extend_list``.
     """
     while self.status:
         try:
             url = self.spider.task_list.get(timeout=1)
         except Empty:
             # log.info('%s: task_list Empty' % self.name)
             continue
         self.spider.increase_running()
         try:
             if not self.spider.check_robots(url):
                 log.info('%s - robots forbidden : %s' % (self.name, url))
                 continue
             page = WebPage(url)
             # print('%s prepare to fetch %s' % (self.name, url))
             if page.fetch():
                 self.spider.db.save_data(page.get_data())
                 for link in page.get_link():                        # retrieve links from html
                     if link not in self.spider.visited_list:        # not visited yet
                         self.spider.extend_list.add(link)
             else:
                 print('%s: Page fetch failed: %s' % (self.name, page.url))
         finally:
             # Always balance increase_running(): this also covers the
             # robots-forbidden `continue` above and any exception raised
             # while the page is being processed.
             self.spider.decrease_running()
Exemple #25
0
    def get_course_manager(self, semester, departments, show_html=0):
        """Fetch the course page and wrap it in a CourseManager.

        The HTML is written to "temp.html" in the current directory and
        loaded from there. When ``show_html`` is truthy the HTML is first
        passed to ``self.show_html``.
        """
        page_source = self.get_course_page(semester, departments)
        if show_html:
            self.show_html(page_source)

        htmlpath = "temp.html"
        with open(htmlpath, "w") as out:
            out.write(page_source)

        return CourseManager(WebPage(htmlpath=htmlpath), logger=self.logger)
Exemple #26
0
 def run(self):
     """Worker loop: take URLs from the spider's task list and process them.

     Runs while ``self.status`` is truthy. Waits up to one second for a
     URL; URLs rejected by ``check_robots`` are logged and skipped. For a
     fetched page the data is saved and every link not yet in the spider's
     ``visited_list`` is added to its ``extend_list``.
     """
     while self.status:
         try:
             url = self.spider.task_list.get(timeout=1)
         except Empty:
             # log.info('%s: task_list Empty' % self.name)
             continue
         self.spider.increase_running()
         try:
             if not self.spider.check_robots(url):
                 log.info('%s - robots forbidden : %s' % (self.name, url))
                 continue
             page = WebPage(url)
             # print('%s prepare to fetch %s' % (self.name, url))
             if page.fetch():
                 self.spider.db.save_data(page.get_data())
                 for link in page.get_link():  # retrieve links from html
                     if link not in self.spider.visited_list:  # not visited yet
                         self.spider.extend_list.add(link)
             else:
                 print('%s: Page fetch failed: %s' % (self.name, page.url))
         finally:
             # Always balance increase_running(): this also covers the
             # robots-forbidden `continue` above and any exception raised
             # while the page is being processed.
             self.spider.decrease_running()
Exemple #27
0
 def createWebPage(self):
     """Create, register and return a new WebPage.

     The page receives the default page settings and the shared network
     access manager; its ``libraryPath`` is the script file's directory.
     """
     new_page = WebPage(self)
     self.m_pages.append(new_page)
     new_page.applySettings(self.m_defaultPageSettings)
     new_page.setNetworkAccessManager(self.m_netAccessMan)
     script_path = os.path.abspath(self.m_scriptFile)
     new_page.libraryPath = os.path.dirname(script_path)
     return new_page
Exemple #28
0
class Browser(object):
    """Bundle a WebPage and a WebView around one network access manager.

    The network access manager is created around a fresh CookieJar and
    installed on the page; the page is then installed on the view.
    """

    def __init__(self, settings):
        """Build the cookie jar, network manager, page and view.

        Args:
            settings: Stored on the instance and passed unchanged to both
                WebPage and WebView.
        """
        super(Browser, self).__init__()
        self._settings = settings

        jar = CookieJar()
        self._cookie_jar = jar
        manager = NetworkAccessManager(jar)
        self._network_access_manager = manager

        page = WebPage(settings)
        self._web_page = page
        page.setNetworkAccessManager(manager)

        view = WebView(settings)
        self._web_view = view
        view.setPage(page)

    def load_url(self, url, show_ui=True):
        """Load ``url`` in the view; show the view only if ``show_ui is True``."""
        view = self._web_view
        view.load(QUrl(url))
        # Identity check on purpose: only the literal True shows the view.
        if show_ui is True:
            view.show()

    def load_html(self, html, url, cookies="", show_ui=True):
        """Render ``html`` in the view with ``url`` as its base URL.

        Args:
            html: Markup handed to the view's ``setHtml``.
            url: Wrapped in a QUrl and passed as the second ``setHtml``
                argument.
            cookies: When non-empty, loaded into the page's ``cookie_jar``
                via ``load_qt_cookie`` before the markup is set.
            show_ui: The view is shown only when this is the literal True.
        """
        has_cookies = len(cookies) > 0
        if has_cookies:
            self._web_page.cookie_jar.load_qt_cookie(cookies)

        view = self._web_view
        view.setHtml(html, QUrl(url))
        if show_ui is True:
            view.show()
Exemple #29
0
 def add_webpage(self):
     """Interactively build a WebPage from console input and optionally save it.

     Prompts for the name, description and URL (kept as entered) and for
     the update timeout, request timeout and data offset (converted with
     ``int``). Then asks ``Save? (y/n)`` until one of y/Y/n/N is entered:

     * y/Y: ``webpage.retrieve()`` is attempted and its result stored in
       ``webpage.current``; a ValueError is logged. The page is appended to
       ``self.__webpages`` whether or not retrieval succeeded.
     * n/N: nothing is saved.
     """
     webpage = WebPage(name='', description='', url='', load_content=False)

     text_prompts = (
         ('name', 'Name: '),
         ('description', 'Description: '),
         ('url', 'URL: '),
     )
     for attr, prompt in text_prompts:
         setattr(webpage, attr, raw_input(prompt))

     int_prompts = (
         ('update_timeout', 'Update timeout: '),
         ('request_timeout', 'Request timeout: '),
         ('data_offset', 'Data offset: '),
     )
     for attr, prompt in int_prompts:
         setattr(webpage, attr, int(raw_input(prompt)))

     while True:
         answer = raw_input('Save? (y/n)')
         if answer in ('y', 'Y'):
             try:
                 webpage.current = webpage.retrieve()
             except ValueError as e:
                 logger.error('[!] Error: ' + str(e))
             # Saved even when retrieve() failed.
             self.__webpages.append(webpage)
             break
         if answer in ('n', 'N'):
             break
Exemple #30
0
    def __init__(self, args, parent=None):
        """Set up the main WebPage, networking, default settings and bootstrap JS.

        Args:
            args: Parsed options object. The attributes read here are
                ``verbose``, ``script``, ``script_name``, ``script_args``,
                ``proxy``, ``disk_cache``, ``ignore_ssl_errors``,
                ``load_images`` and ``load_plugins``.
            parent: Forwarded to ``QObject.__init__``.

        Exits the process with status 1 (after ``qCritical``) when
        ``:/bootstrap.js`` cannot be opened or reads back empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_script = args.script
        self.m_scriptFile = args.script_name
        self.m_args = args.script_args

        # Hook call receives every local defined so far (self, args, parent),
        # so local names in this method are visible to whatever handles it.
        do_action('PhantomInitPre', Bunch(locals()))

        # A falsy args.proxy selects the system proxy configuration; otherwise
        # args.proxy[0] and int(args.proxy[1]) are passed to QNetworkProxy as
        # an HTTP proxy.
        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        # Route the page's console-message signal to printConsoleMessage.
        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        # Defaults are recorded first and then applied to the main page.
        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_page.applySettings(self.m_defaultPageSettings)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        # Read the bootstrap script through QFile and evaluate it in the main
        # frame; a failed open or empty content aborts the process.
        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            qCritical('Can not bootstrap!')
            sys.exit(1)
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        if not bootstrapper:
            qCritical('Can not bootstrap!')
            sys.exit(1)
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        # Second hook; by now locals() also contains proxy (if set),
        # bootstrap and bootstrapper.
        do_action('PhantomInitPost', Bunch(locals()))
Exemple #31
0
    def __init__(self, parent, args):
        """Set up the main WebPage, proxy, default settings and bootstrap JS.

        Args:
            parent: Forwarded to the base-class initialiser.
            args: Parsed options object. The attributes read here are
                ``verbose``, ``script``, ``script_args``,
                ``script_encoding``, ``output_encoding``, ``proxy``,
                ``load_images``, ``load_plugins`` and
                ``local_to_remote_url_access``.
        """
        super(Phantom, self).__init__(parent)

        # instance state
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self, args)
        self.m_returnValue = 0
        self.m_terminated = False

        # values taken from the parsed arguments
        self.app_args = args
        self.m_scriptFile = args.script
        self.m_args = args.script_args
        self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
        self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys)

        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        # An explicit proxy is installed application-wide as an HTTP proxy;
        # with none given, the system proxy configuration is used.
        if args.proxy is not None:
            proxy_host, proxy_port = args.proxy[0], int(args.proxy[1])
            QNetworkProxy.setApplicationProxy(
                QNetworkProxy(QNetworkProxy.HttpProxy, proxy_host, proxy_port))
        else:
            QNetworkProxyFactory.setUseSystemConfiguration(True)

        page = self.m_page
        page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        # Record the defaults, then apply them to the main page.
        defaults = self.m_defaultPageSettings
        defaults['loadImages'] = args.load_images
        defaults['loadPlugins'] = args.load_plugins
        defaults['javascriptEnabled'] = True
        defaults['XSSAuditingEnabled'] = False
        defaults['userAgent'] = page.userAgent()
        defaults['localToRemoteUrlAccessEnabled'] = args.local_to_remote_url_access
        page.applySettings(defaults)

        script_path = os.path.abspath(self.m_scriptFile)
        self.libraryPath = os.path.dirname(script_path)

        # expose this object's properties and slots to JavaScript as 'phantom'
        page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        with QPyFile(':/bootstrap.js') as bootstrap_file:
            bootstrap_source = str(bootstrap_file.readAll())
        page.mainFrame().evaluateJavaScript(bootstrap_source)

        do_action('PhantomInitPost')
Exemple #32
0
    def updateView(self):
        """Install a fresh WebPage on the web view and load the generated HTML.

        The new page delegates all links, exposes this object to page
        JavaScript as ``qtWindow``, and the HTML from ``generateHtml`` is
        loaded with ``<dataDir>/static/`` as its base URL.
        """
        fresh_page = WebPage(logger=None, parent=self)
        fresh_page.setLinkDelegationPolicy(QWebPage.DelegateAllLinks)
        fresh_page.mainFrame().addToJavaScriptWindowObject("qtWindow", self)

        view = self.ui.webView
        view.setPage(fresh_page)

        markup = self.generateHtml()
        # Keep the trailing '/' on the base URL: without it QWebView cannot
        # load files from that directory.
        static_root = os.path.join(self.dataDir, "static/")
        view.setHtml(markup, QUrl.fromLocalFile(static_root))
def process_web_page(thread_id, sites, q):
    """Collect the allowed external links of every site and put them on ``q``.

    Each site is wrapped in ``WebPageLinkExtractor(WebPage(site))``. Of its
    ``external_links``, those whose ``qdn`` matches ``FORBIDDEN_DOMAINS`` or
    whose ``url`` matches ``FORBIDDEN_FILETYPES`` are dropped. After each
    successfully processed site the function sleeps for 0.1 seconds.

    Processing is best-effort: a site that raises is skipped, and links
    gathered from it before the failure are kept. KeyboardInterrupt and
    SystemExit are not swallowed.

    Args:
        thread_id: Identifier of the calling worker; not used here.
        sites: Iterable of sites; each one is passed to ``WebPage``.
        q: Object with a ``put`` method; it receives a single list holding
            every kept link (an empty list when nothing was collected).
    """
    webpages = []

    for site in sites:
        try:
            extractor = WebPageLinkExtractor(WebPage(site))
            # Discard invalid webpages
            for web in extractor.external_links:
                if FORBIDDEN_DOMAINS.match(web.qdn) or \
                   FORBIDDEN_FILETYPES.match(web.url):
                    continue
                webpages.append(web)
            time.sleep(0.1)
        except Exception:
            # Skip the failing site but let interpreter-exit signals
            # (KeyboardInterrupt, SystemExit) propagate.
            continue

    q.put(webpages)
Exemple #34
0
 def start(self):
     """Crawl until the URL queue yields None.

     Each URL popped from ``self.queue`` is printed and downloaded. When
     the download returns HTML, it is stored through ``self.webpagedb``,
     parsed into ``self.webpage``, and the ``a``-tag links that pass the
     patterns from ``get_patterns_from_rules`` are handed to
     ``add_seeds``. ``self.mysleep(3)`` is called after every URL.
     """
     while True:
         url = self.queue.pop_url()
         print(url)
         if url is None:
             print("crawling task is done.")
             break

         # download() returns (error_msg, url, redirected_url, html); only
         # the url and the html are used below.
         _error_msg, url, _redirected_url, html = self.downloader.download(url)
         if html is not None:
             self.webpagedb.html2db(url, html)

             self.webpage = WebPage(url, html)
             self.webpage.parse_links()
             patterns = self.get_patterns_from_rules(url)
             print(patterns)
             self.add_seeds(
                 self.webpage.filter_links(tags=['a'], patterns=patterns))
         self.mysleep(3)
Exemple #35
0
 def start(self):
     """Crawl until the URL queue yields None.

     Each URL popped from ``self.queue`` is printed and downloaded. When
     the download returns HTML, it is stored through ``self.webpagedb``,
     parsed into ``self.webpage``, and the ``a``-tag links that pass the
     patterns from ``get_patterns_from_rules`` are added as new seeds,
     provided the filtered result is truthy. ``self.mysleep(3)`` is called
     after every URL.
     """
     while True:
         url = self.queue.popUrl()
         print(url)
         if url is None:
             print("crawling task is done.")
             break

         # download() returns (error_msg, url, redirected_url, html); only
         # the url and the html are used below.
         _error_msg, url, _redirected_url, html = self.downloader.download(url)
         if html is not None:
             # keep a copy of the downloaded page
             self.webpagedb.storeHtmlToDb(url, html)

             # parse the page and collect all of its hyperlinks
             self.webpage = WebPage(url, html)
             self.webpage.parseLinks()
             patterns = self.get_patterns_from_rules(url)
             print(patterns)
             # filter_links may come back as None, hence the truthiness check
             found = self.webpage.filter_links(tags=['a'], patterns=patterns)
             if found:
                 self.addSeeds(found)
         # pause briefly before crawling the next URL
         self.mysleep(3)
Exemple #36
0
    def get_all_urls(self):
        """Collect the building URLs found on every index page.

        For each URL from ``get_all_page_urls`` the page is fetched, every
        ``<a>`` tag carrying ``se:clickable:target="true"`` is selected, and
        its ``href`` is prefixed with ``https://streeteasy.com``.
        ``IndexPage.url_count`` is incremented once per collected URL, and
        progress is printed along the way.

        Returns:
            dict: {"page 1": [url1, url2, url3 ...], "page 2": [...], ...}
        """
        urls_by_page = {}
        page_count = 0  # stays 0 when there are no index pages at all
        for page_count, page_url in enumerate(self.get_all_page_urls(), start=1):
            soup = WebPage(page_url).get_soup()
            anchors = soup.findAll('a', {"se:clickable:target": "true"})

            page_urls = []
            for anchor in anchors:
                page_urls.append("https://streeteasy.com" + anchor.get("href"))
                IndexPage.url_count += 1

            urls_by_page[f"page {page_count}"] = page_urls
            print(f"Retrived all url on page {page_count}")

        print("=============================")
        print(f"RETRIVED ALL URLS FOR {page_count} PAGES")
        print(f"TOTAL URLS IDENTIFIED: ", IndexPage.url_count)
        return urls_by_page
Exemple #37
0
# Scripting host object: owns the main WebPage, registers itself in the page's
# JavaScript window object under the name 'phantom', and exposes the slots and
# properties below to scripts.
# (Described in a comment rather than a class docstring because the class body
# ends by passing locals() to a plugin hook.)
class Phantom(QObject):
    def __init__(self, args, parent=None):
        """Set up the main WebPage, networking, default settings and bootstrap JS.

        Args:
            args: Parsed options object. The attributes read here are
                ``verbose``, ``script``, ``script_args``, ``proxy``,
                ``disk_cache``, ``ignore_ssl_errors``, ``load_images`` and
                ``load_plugins``.
            parent: Forwarded to ``QObject.__init__``.

        Exits the process via ``sys.exit('Can not bootstrap!')`` when
        ``:/bootstrap.js`` cannot be opened or reads back empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_scriptFile = args.script
        self.m_args = args.script_args

        # Hook call receives every local defined so far (self, args, parent),
        # so local names in this method are visible to whatever handles it.
        do_action('PhantomInitPre', Bunch(locals()))

        # A falsy args.proxy selects the system proxy configuration; otherwise
        # args.proxy[0] and int(args.proxy[1]) are passed to QNetworkProxy as
        # an HTTP proxy.
        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        # Route the page's console-message signal to printConsoleMessage.
        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        # Defaults are recorded first and then applied to the main page; the
        # same dict is reused for pages made by createWebPage().
        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_page.applySettings(self.m_defaultPageSettings)

        # Goes through the libraryPath property setter below, i.e. it is
        # stored on self.m_page.
        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        # Read the bootstrap script through QFile and evaluate it in the main
        # frame; a failed open or empty content aborts the process.
        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            sys.exit('Can not bootstrap!')
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        if not bootstrapper:
            sys.exit('Can not bootstrap!')
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        # Second hook; by now locals() also contains proxy (if set),
        # bootstrap and bootstrapper.
        do_action('PhantomInitPost', Bunch(locals()))

    def execute(self):
        """Inject the user script into the main frame.

        Returns:
            False if ``exit()`` has already set ``m_terminated``, else True.
        """
        # The second argument is the directory of this module file, not of
        # the script. NOTE(review): the meaning of the trailing True depends
        # on injectJsInFrame, which is not shown here.
        injectJsInFrame(self.m_scriptFile, os.path.dirname(os.path.abspath(__file__)), self.m_page.mainFrame(), True)
        return not self.m_terminated

    def printConsoleMessage(self, message, lineNumber, source):
        """Print a console message, prefixed with ``source:line`` when source is truthy."""
        if source:
            message = '%s:%d %s' % (source, lineNumber, message)
        print message

    def returnValue(self):
        """Return the code stored by ``exit()`` (0 until then)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Script arguments taken from ``args.script_args`` at construction."""
        return self.m_args

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create and return a new WebPage configured like the main one.

        The page receives the default page settings, the shared network
        access manager and a ``libraryPath`` set to the script's directory.
        """
        page = WebPage(self)
        page.applySettings(self.m_defaultPageSettings)
        page.setNetworkAccessManager(self.m_netAccessMan)
        page.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """The settings dict that is applied to every page created here."""
        return self.m_defaultPageSettings

    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record ``code``, destroy the main page and quit the application.

        After this call ``self.m_page`` no longer exists, so members that
        read it (``injectJs``, ``libraryPath``) must not be used.
        """
        self.m_terminated = True
        self.m_returnValue = code

        # stop javascript execution; delete C++ object first,
        # then delete the Python reference
        sip.delete(self.m_page)
        del self.m_page

        QApplication.instance().exit(code)

    @pyqtSlot(str, result=bool)
    def injectJs(self, filePath):
        """Inject ``filePath`` into the main frame, passing ``libraryPath`` along.

        Returns whatever ``injectJsInFrame`` returns (declared as bool).
        """
        return injectJsInFrame(filePath, self.libraryPath, self.m_page.mainFrame())

    @pyqtProperty(str)
    def libraryPath(self):
        """Library path; the value is stored on the main page, not on this object."""
        return self.m_page.libraryPath

    @libraryPath.setter
    def libraryPath(self, dirPath):
        self.m_page.libraryPath = dirPath

    @pyqtProperty(str)
    def scriptName(self):
        """File name of the script, without its directory."""
        return os.path.basename(self.m_scriptFile)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version numbers as a map with the keys major, minor and patch."""
        version = {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }
        return version

    # Class-level hook: runs once while the class body executes and receives
    # the class namespace (all names defined above).
    do_action('Phantom', Bunch(locals()))
Exemple #38
0
# Scripting host object: owns the main WebPage, holds the script source and
# the render state (clip rectangle, paper size, viewport), and is added to
# the page's JavaScript window object as 'phantom'.
# (Described in a comment rather than a class docstring because the class body
# ends by passing locals() to a plugin hook.)
class Phantom(QObject):
    def __init__(self, args, parent=None):
        """Read the script, create the main WebPage and configure it.

        Args:
            args: Parsed options object. ``args.script`` is an open file
                object (it is read and closed here); the other attributes
                read are ``verbose``, ``script_args``, ``upload_file``,
                ``load_images``, ``load_plugins``, ``proxy``,
                ``disk_cache`` and ``ignore_ssl_errors``.
            parent: Forwarded to ``QObject.__init__``.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_loadStatus = self.m_state = ''
        # Three independent dicts. A chained assignment (a = b = c = {})
        # would bind all three names to ONE dict, so context variables, the
        # paper size and the script cache would overwrite each other.
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # Defined up front so returnValue() works before exit() is called.
        self.m_returnValue = 0
        # setup the values from args
        self.m_script = args.script.read()
        self.m_scriptFile = args.script.name
        # Resolve to an absolute directory: for a bare file name dirname()
        # is '' and appending '/' would point at the filesystem root.
        self.m_scriptDir = os.path.dirname(os.path.abspath(args.script.name)) + '/'
        self.m_args = args.script_args
        self.m_upload_file = args.upload_file
        # Images load unless explicitly 'no'; plugins only when 'yes'.
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script.close()

        # Hook call receives the locals defined so far; keep local names
        # stable because handlers can read them.
        do_action('PhantomInitPre', Bunch(locals()))

        # Transparent base brush for the page palette.
        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        # A falsy args.proxy selects the system proxy configuration; otherwise
        # args.proxy[0] and int(args.proxy[1]) are passed to QNetworkProxy as
        # an HTTP proxy.
        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
        self.m_page.settings().setAttribute(QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled, True)
        self.m_page.settings().setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

        m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(self.inject)
        self.m_page.loadFinished.connect(self.finish)

        do_action('PhantomInitPost', Bunch(locals()))

    def execute(self):
        """Evaluate the user script in the main frame.

        A leading ``#!`` line is commented out first, and a script whose file
        name ends in ``.coffee`` is converted with CSConverter.
        """
        if self.m_script.startswith('#!'):
            self.m_script = '//' + self.m_script

        if self.m_scriptFile.lower().endswith('.coffee'):
            coffee = CSConverter(self)
            self.m_script = coffee.convert(self.m_script)

        self.m_page.mainFrame().evaluateJavaScript(self.m_script)

    def finish(self, success):
        """Slot for the page's ``loadFinished``: record the status, re-run the script."""
        self.m_loadStatus = 'success' if success else 'fail'
        self.m_page.mainFrame().evaluateJavaScript(self.m_script)

    def inject(self):
        """Slot for ``javaScriptWindowObjectCleared``: re-add this object as 'phantom'."""
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

    def renderPdf(self, fileName):
        """Print the main frame to ``fileName`` as a PDF.

        The paper size comes from ``self.m_paperSize``: explicit
        ``width``/``height`` win, otherwise a named ``format`` (A3, A4, A5,
        Legal, Letter, Tabloid; unknown names fall back to A4) with an
        optional ``orientation``. When no paper size is set, the frame's
        contents size is used with a zero border.

        Returns:
            True after printing, False when the paper size has neither
            width/height nor a format.
        """
        p = QPrinter()
        p.setOutputFormat(QPrinter.PdfFormat)
        p.setOutputFileName(fileName)
        p.setResolution(pdf_dpi)
        # Work on a copy: the computed fallback size below must not be
        # written back into the stored paper size, or it would stick for
        # later renders of differently sized content.
        paperSize = dict(self.m_paperSize)

        if not paperSize:
            pageSize = QSize(self.m_page.mainFrame().contentsSize())
            paperSize['width'] = str(pageSize.width()) + 'px'
            paperSize['height'] = str(pageSize.height()) + 'px'
            paperSize['border'] = '0px'

        if paperSize.get('width') and paperSize.get('height'):
            sizePt = QSizeF(ceil(self.stringToPointSize(paperSize['width'])),
                            ceil(self.stringToPointSize(paperSize['height'])))
            p.setPaperSize(sizePt, QPrinter.Point)
        elif 'format' in paperSize:
            orientation = QPrinter.Landscape if paperSize.get('orientation') and paperSize['orientation'].lower() == 'landscape' else QPrinter.Portrait
            orientation = QPrinter.Orientation(orientation)
            p.setOrientation(orientation)

            formats = {
                'A3': QPrinter.A3,
                'A4': QPrinter.A4,
                'A5': QPrinter.A5,
                'Legal': QPrinter.Legal,
                'Letter': QPrinter.Letter,
                'Tabloid': QPrinter.Tabloid
            }

            p.setPaperSize(QPrinter.A4) # fallback
            wanted = paperSize['format'].lower()
            # Case-insensitive match on the format name.
            for formatName, size in formats.items():
                if formatName.lower() == wanted:
                    p.setPaperSize(size)
                    break
        else:
            return False

        border = floor(self.stringToPointSize(paperSize['border'])) if paperSize.get('border') else 0
        p.setPageMargins(border, border, border, border, QPrinter.Point)

        self.m_page.mainFrame().print_(p)
        return True

    def returnValue(self):
        """Return the code stored by ``exit()`` (0 until then)."""
        return self.m_returnValue

    def stringToPointSize(self, string):
        """Convert a length such as ``'10mm'`` or ``'300px'`` to points.

        Recognised suffixes are mm, cm, in and px; a string without one of
        them is treated like px.

        Raises:
            ValueError: If the part before the unit is not a number.
        """
        units = (
            ('mm', 72 / 25.4),
            ('cm', 72 / 2.54),
            ('in', 72.0),
            ('px', 72.0 / pdf_dpi / 2.54),
            ('', 72.0 / pdf_dpi / 2.54)
        )

        for unit, factor in units:
            if string.endswith(unit):
                # Cut off exactly the suffix (rstrip() would strip a
                # character set, not the suffix).
                value = string[:len(string) - len(unit)]
                return float(value) * factor
        return 0

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Script arguments taken from ``args.script_args`` at construction."""
        return self.m_args

    @pyqtProperty('QVariantMap')
    def clipRect(self):
        """Clip rectangle as a map with the keys width, height, top and left."""
        result = {
            'width': self.m_clipRect.width(),
            'height': self.m_clipRect.height(),
            'top': self.m_clipRect.top(),
            'left': self.m_clipRect.left()
        }
        return result

    @clipRect.setter
    def clipRect(self, size):
        # Keys missing from `size` keep their current value. Negative width
        # or height is clamped to 0; top and left may be negative.
        # Values are collected in a local dict rather than module globals.
        rect = {}
        for item in ('width', 'height', 'top', 'left'):
            try:
                value = int(size[item])
            except KeyError:
                value = getattr(self.m_clipRect, item)()
            else:
                if value < 0 and item not in ('top', 'left'):
                    value = 0
            rect[item] = value

        self.m_clipRect = QRect(rect['left'], rect['top'], rect['width'], rect['height'])

    @pyqtProperty(str)
    def content(self):
        """HTML of the main frame; assigning replaces the frame's HTML."""
        return self.m_page.mainFrame().toHtml()

    @content.setter
    def content(self, content):
        self.m_page.mainFrame().setHtml(content)

    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record ``code``, stop reacting to page loads and queue an application quit."""
        self.m_returnValue = code
        self.m_page.loadFinished.disconnect(self.finish)
        QTimer.singleShot(0, qApp, SLOT('quit()'))

    @pyqtProperty(str)
    def loadStatus(self):
        """'' initially, 'loading' after ``open``, then 'success' or 'fail'."""
        return self.m_loadStatus

    @pyqtSlot(str, result=bool)
    def loadScript(self, script):
        """Load a script relative to the main script's directory and run it.

        The processed source is cached per file name, so a second call with
        the same name re-evaluates the cached text without touching disk. A
        leading ``#!`` is commented out and ``.coffee`` files are converted
        with CSConverter.

        Returns:
            True when the script was evaluated, False when it could not be
            read.
        """
        if script in self.m_loadScript_cache:
            self.m_page.mainFrame().evaluateJavaScript(self.m_loadScript_cache[script])
            return True

        scriptFile = script
        try:
            # The context manager closes the file even if read() fails.
            with codecs.open(self.m_scriptDir + script, encoding='utf-8') as f:
                script = f.read()
        except IOError:
            return False

        if script.startswith('#!'):
            script = '//' + script

        if scriptFile.lower().endswith('.coffee'):
            coffee = CSConverter(self)
            script = coffee.convert(script)

        self.m_loadScript_cache[scriptFile] = script
        self.m_page.mainFrame().evaluateJavaScript(script)
        return True

    @pyqtSlot(str, name='open')
    def open_(self, address):
        """Stop any current load and navigate the main frame to ``address``."""
        qDebug('Opening address %s' % address)
        self.m_page.triggerAction(QWebPage.Stop)
        self.m_loadStatus = 'loading'
        self.m_page.mainFrame().setUrl(QUrl(address))

    @pyqtProperty('QVariantMap')
    def paperSize(self):
        """Paper size map used by ``renderPdf``."""
        return self.m_paperSize

    @paperSize.setter
    def paperSize(self, size):
        self.m_paperSize = size

    @pyqtSlot(str, result=bool)
    def render(self, fileName):
        """Render the page to ``fileName``.

        The target directory is created if needed. A name ending in ``.pdf``
        is delegated to ``renderPdf``; anything else is painted into an
        ARGB32 image limited to the clip rectangle when one is set, else the
        whole contents.

        Returns:
            The result of ``renderPdf`` or ``QImage.save``; False when the
            page has no contents to render.
        """
        fileInfo = QFileInfo(fileName)
        path = QDir()
        path.mkpath(fileInfo.absolutePath())

        if fileName.lower().endswith('.pdf'):
            return self.renderPdf(fileName)

        viewportSize = QSize(self.m_page.viewportSize())
        pageSize = QSize(self.m_page.mainFrame().contentsSize())

        # Nothing to paint. (Comparing a QSize against '' is never true.)
        if pageSize.isEmpty():
            return False

        if not self.m_clipRect.isEmpty():
            bufferSize = self.m_clipRect.size()
        else:
            bufferSize = self.m_page.mainFrame().contentsSize()

        image = QImage(bufferSize, QImage.Format_ARGB32)
        image.fill(qRgba(255, 255, 255, 0))
        p = QPainter(image)

        p.setRenderHint(QPainter.Antialiasing, True)
        p.setRenderHint(QPainter.TextAntialiasing, True)
        p.setRenderHint(QPainter.SmoothPixmapTransform, True)

        # Temporarily grow the viewport to the full contents; restored below.
        self.m_page.setViewportSize(pageSize)

        if not self.m_clipRect.isEmpty():
            p.translate(-self.m_clipRect.left(), -self.m_clipRect.top())
            self.m_page.mainFrame().render(p, QRegion(self.m_clipRect))
        else:
            self.m_page.mainFrame().render(p)

        p.end()
        self.m_page.setViewportSize(viewportSize)
        return image.save(fileName)

    @pyqtSlot('QWebElement', str)
    def setFormInputFile(self, el, fileTag):
        """Store ``fileTag`` on the page and dispatch a synthetic click on ``el``."""
        self.m_page.m_nextFileTag = fileTag
        el.evaluateJavaScript('''(function(target){
                              var evt = document.createEvent('MouseEvents');
                              evt.initMouseEvent("click", true, true, window,
                              0, 0, 0, 0, 0, false, false, false, false, 0, null);
                              target.dispatchEvent(evt);})(this);''')

    @pyqtSlot(int)
    def sleep(self, ms):
        """Wait for more than ``ms`` milliseconds while processing Qt events."""
        startTime = QTime.currentTime()
        while True:
            QApplication.processEvents(QEventLoop.AllEvents, 25)
            if startTime.msecsTo(QTime.currentTime()) > ms:
                break
            usleep(0.005)

    @pyqtProperty(str)
    def state(self):
        """Free-form state string (initially '')."""
        return self.m_state

    @state.setter
    def state(self, value):
        self.m_state = value

    @pyqtProperty(str)
    def userAgent(self):
        """User agent string; the value is stored on the main page."""
        return self.m_page.m_userAgent

    @userAgent.setter
    def userAgent(self, ua):
        self.m_page.m_userAgent = ua

    @pyqtSlot(str, result='QVariant')
    @pyqtSlot(int, result='QVariant')
    @pyqtSlot(str, 'QVariant')
    @pyqtSlot(int, 'QVariant')
    def ctx(self, name, value=None):
        """Get or set a context variable.

        With ``name`` only, return the stored value (None if unset). With a
        value, store it under ``name``.
        """
        # Test for None, not truthiness, so 0, '' and False can be stored.
        if value is None:
            return self.m_var.get(name)
        self.m_var[name] = value

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version numbers as a map with the keys major, minor and patch."""
        version = {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }
        return version

    @pyqtProperty('QVariantMap')
    def viewportSize(self):
        """Viewport size of the main page as a map with width and height."""
        size = self.m_page.viewportSize()
        result = {
            'width': size.width(),
            'height': size.height()
        }
        return result

    @viewportSize.setter
    def viewportSize(self, size):
        # Keys missing from `size` keep the current viewport value; negative
        # values are clamped to 0. Values are collected in a local dict
        # rather than module globals.
        dims = {}
        for item in ('width', 'height'):
            try:
                value = int(size[item])
            except KeyError:
                value = getattr(self.m_page.viewportSize(), item)()
            else:
                if value < 0:
                    value = 0
            dims[item] = value

        self.m_page.setViewportSize(QSize(dims['width'], dims['height']))

    # Class-level hook: runs once while the class body executes and receives
    # the class namespace (all names defined above).
    do_action('Phantom', Bunch(locals()))
Exemple #39
0
class Phantom(QObject):
    """Scripting host object exposed to page JavaScript as ``phantom``.

    Owns the main WebPage plus every page created through
    ``createWebPage`` (tracked in ``m_pages``), and also exposes a
    FileSystem object to JavaScript as ``fs``.
    """

    def __init__(self, args, parent=None):
        """Set up the main WebPage, networking, default settings and bootstrap JS.

        Args:
            args: Parsed options object. The attributes read here are
                ``verbose``, ``script``, ``script_args``, ``proxy``,
                ``disk_cache``, ``ignore_ssl_errors``, ``load_images``,
                ``load_plugins`` and ``local_access_remote``.
            parent: Forwarded to ``QObject.__init__``.

        Exits the process via ``sys.exit('Can not bootstrap!')`` when
        ``:/bootstrap.js`` cannot be opened or reads back empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_scriptFile = args.script
        self.m_args = args.script_args

        self.m_filesystem = FileSystem(self)
        # The main page is tracked like any other so exit() can delete it.
        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        # A falsy args.proxy selects the system proxy configuration; otherwise
        # args.proxy[0] and int(args.proxy[1]) are passed to QNetworkProxy as
        # an HTTP proxy.
        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                                  int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache,
                                                   args.ignore_ssl_errors,
                                                   self)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        # Route the page's console-message signal to printConsoleMessage.
        self.m_page.javaScriptConsoleMessageSent.connect(
            self.printConsoleMessage)

        # Defaults are recorded first and then applied to the main page; the
        # same dict is reused for pages made by createWebPage().
        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings[
            'localAccessRemote'] = args.local_access_remote
        self.m_page.applySettings(self.m_defaultPageSettings)

        # Goes through the libraryPath property setter below, i.e. it is
        # stored on self.m_page.
        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)
        self.m_page.mainFrame().addToJavaScriptWindowObject(
            'fs', self.m_filesystem)

        # Read the bootstrap script through QFile and evaluate it in the main
        # frame; a failed open or empty content aborts the process.
        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            sys.exit('Can not bootstrap!')
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        if not bootstrapper:
            sys.exit('Can not bootstrap!')
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        do_action('PhantomInitPost')

    def execute(self):
        """Inject the user script into the main frame.

        Returns:
            False if ``exit()`` has already set ``m_terminated``, else True.
        """
        # The second argument is the directory of this module file, not of
        # the script. NOTE(review): the meaning of the trailing True depends
        # on injectJsInFrame, which is not shown here.
        injectJsInFrame(self.m_scriptFile,
                        os.path.dirname(os.path.abspath(__file__)),
                        self.m_page.mainFrame(), True)
        return not self.m_terminated

    def printConsoleMessage(self, message, lineNumber, source):
        """Print a console message, prefixed with ``source:line`` when source is truthy."""
        if source:
            message = '%s:%d %s' % (source, lineNumber, message)
        print message

    def returnValue(self):
        """Return the code stored by ``exit()`` (0 until then)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtSlot(WebPage)
    def _destroy(self, page):
        """Stop tracking ``page`` and delete its underlying C++ object.

        ``list.remove`` raises ValueError if the page is not in ``m_pages``.
        """
        self.m_pages.remove(page)
        sip.delete(page)

    @pyqtProperty('QStringList')
    def args(self):
        """Script arguments taken from ``args.script_args`` at construction."""
        return self.m_args

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create, track and return a new WebPage configured like the main one.

        The page is appended to ``m_pages`` and receives the default page
        settings, the shared network access manager and a ``libraryPath``
        set to the script's directory.
        """
        page = WebPage(self)
        self.m_pages.append(page)
        page.applySettings(self.m_defaultPageSettings)
        page.setNetworkAccessManager(self.m_netAccessMan)
        page.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """The settings dict that is applied to every page created here."""
        return self.m_defaultPageSettings

    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record ``code``, delete every tracked page and quit the application.

        Afterwards ``self.m_page`` is None, so members that read it
        (``injectJs``, ``libraryPath``) must not be used.
        """
        self.m_terminated = True
        self.m_returnValue = code

        # stop javascript execution in start script;
        # delete all the pages C++ objects, then clear
        # the page list, and empty the Phantom page
        for page in self.m_pages:
            sip.delete(page)
        del self.m_pages[:]
        self.m_page = None

        QApplication.instance().exit(code)

    @pyqtSlot(str, result=bool)
    def injectJs(self, filePath):
        """Inject ``filePath`` into the main frame, passing ``libraryPath`` along.

        Returns whatever ``injectJsInFrame`` returns (declared as bool).
        """
        return injectJsInFrame(filePath, self.libraryPath,
                               self.m_page.mainFrame())

    @pyqtProperty(str)
    def libraryPath(self):
        """Library path; the value is stored on the main page, not on this object."""
        return self.m_page.libraryPath

    @libraryPath.setter
    def libraryPath(self, dirPath):
        self.m_page.libraryPath = dirPath

    @pyqtProperty(str)
    def scriptName(self):
        """File name of the script, without its directory."""
        return os.path.basename(self.m_scriptFile)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version numbers as a map with the keys major, minor and patch."""
        version = {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }
        return version

    # Class-level hook: runs once while the class body executes.
    do_action('Phantom')
Exemple #40
0
class Phantom(QObject):
    """Object exposed to scripts as ``phantom``.

    It owns the main ``WebPage``, keeps track of every page created through
    ``createWebPage`` and records the exit state requested by the script.
    """

    def __init__(self, parent, args):
        """Create the main page, apply the settings from *args* and run bootstrap.js.

        Attributes read from *args*: ``verbose``, ``script``, ``script_args``,
        ``script_encoding``, ``output_encoding``, ``proxy``, ``load_images``,
        ``load_plugins`` and ``local_to_remote_url_access``.
        """
        super(Phantom, self).__init__(parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self, args)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.app_args = args
        self.m_scriptFile = args.script
        self.m_args = args.script_args
        self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
        # NOTE(review): encoding_sys is not a standard attribute of sys.stdout;
        # presumably stdout has been wrapped elsewhere - confirm.
        self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys)

        # The main page is tracked like any other page so exit() deletes it too.
        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        if args.proxy is None:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as (host, port)
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['javascriptEnabled'] = True
        self.m_defaultPageSettings['XSSAuditingEnabled'] = False
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings['localToRemoteUrlAccessEnabled'] = args.local_to_remote_url_access
        self.m_page.applySettings(self.m_defaultPageSettings)

        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        with QPyFile(':/bootstrap.js') as f:
            bootstrap = str(f.readAll())
        self.m_page.mainFrame().evaluateJavaScript(bootstrap)

        do_action('PhantomInitPost')

    def execute(self):
        """Inject the start script into the main frame.

        Returns ``True`` unless the script has already requested termination
        through ``exit``.
        """
        injectJsInFrame(self.m_scriptFile, self.m_scriptEncoding.encoding, os.path.dirname(os.path.abspath(__file__)), self.m_page.mainFrame(), True)
        return not self.m_terminated

    def printConsoleMessage(self, message, lineNumber, source):
        """Print a JavaScript console message, prefixed with ``source:line`` when *source* is set."""
        if source:
            message = '%s:%d %s' % (source, lineNumber, message)
        # Single-argument call form prints the same text on Python 2 and 3.
        print(message)

    def returnValue(self):
        """Return the exit code recorded by ``exit`` (0 until then)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Script arguments taken from ``args.script_args``."""
        return self.m_args

    @pyqtSlot(result=FileSystem)
    def createFilesystem(self):
        """Return a new ``FileSystem`` object parented to this instance."""
        return FileSystem(self)

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create, track and configure a new ``WebPage``.

        The page receives the default page settings and its ``libraryPath``
        is set to the directory of the script file.
        """
        page = WebPage(self, self.app_args)
        self.m_pages.append(page)
        page.applySettings(self.m_defaultPageSettings)
        page.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """Settings mapping applied to every page created by this object."""
        return self.m_defaultPageSettings

    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record *code*, delete every tracked page and quit the application."""
        self.m_terminated = True
        self.m_returnValue = code

        # stop javascript execution in start script;
        # delete all the pages C++ objects, then clear
        # the page list, and empty the Phantom page
        for page in self.m_pages:
            sip.delete(page)
        del self.m_pages[:]
        self.m_page = None

        QApplication.instance().exit(code)

    @pyqtSlot(str, result=bool)
    def injectJs(self, filePath):
        """Inject the script at *filePath* into the main frame via ``injectJsInFrame``."""
        return injectJsInFrame(filePath, self.m_scriptEncoding.encoding, self.libraryPath, self.m_page.mainFrame())

    @pyqtSlot(str, result=str)
    def loadModuleSource(self, name):
        """Return the source text of the resource ``:/modules/<name>.js``."""
        moduleSourceFilePath = ':/modules/%s.js' % name

        with QPyFile(moduleSourceFilePath) as f:
            moduleSource = str(f.readAll())

        return moduleSource

    @pyqtProperty(str)
    def libraryPath(self):
        """``libraryPath`` of the main page."""
        return self.m_page.libraryPath

    @libraryPath.setter
    def libraryPath(self, dirPath):
        self.m_page.libraryPath = dirPath

    @pyqtProperty(str)
    def outputEncoding(self):
        """Name of the current output encoding."""
        return self.m_outputEncoding.name

    @outputEncoding.setter
    def outputEncoding(self, encoding):
        self.m_outputEncoding = Encode(encoding, self.m_outputEncoding.encoding)

        # Apply the new encoding to BOTH streams. The previous code assigned
        # sys.stdout.encode_to twice and never updated sys.stderr.encode_to.
        for stream in (sys.stdout, sys.stderr):
            stream.encoding = self.m_outputEncoding.encoding
            stream.encode_to = self.m_outputEncoding.encoding

    @pyqtProperty(str)
    def scriptName(self):
        """File name (last path component) of the script file."""
        return os.path.basename(self.m_scriptFile)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version as a mapping with 'major', 'minor' and 'patch' keys."""
        version = {
            'major': __version_info__[0],
            'minor': __version_info__[1],
            'patch': __version_info__[2]
        }
        return version

    # Plugin hook fired while the class body executes.
    do_action('Phantom')
Exemple #41
0
    def __init__(self, args, parent=None):
        """Read the script from *args* and configure the main ``WebPage``.

        Attributes read from *args*: ``verbose``, ``script`` (an open file
        object; it is read and then closed here), ``script_args``,
        ``upload_file``, ``load_images``, ``load_plugins``, ``proxy``,
        ``disk_cache`` and ``ignore_ssl_errors``.

        The local variable names in this method are part of its behaviour:
        both ``do_action`` hooks receive ``Bunch(locals())``.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_loadStatus = self.m_state = ''
        # Each attribute gets its own dict. The former chained assignment
        # (a = b = c = {}) bound all three names to one shared object, so a
        # write through any of them showed up in the other two.
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # setup the values from args
        self.m_script = args.script.read()
        self.m_scriptFile = args.script.name
        self.m_scriptDir = os.path.dirname(args.script.name) + '/'
        self.m_args = args.script_args
        self.m_upload_file = args.upload_file
        # Images load unless explicitly disabled with 'no'; plugins load only
        # when explicitly enabled with 'yes'.
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script.close()

        do_action('PhantomInitPre', Bunch(locals()))

        # Transparent base brush for the page palette.
        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as (host, port)
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                                  int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages,
                                            autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled,
                                            pluginsEnabled)
        self.m_page.settings().setAttribute(
            QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(
            QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled,
                                            True)
        self.m_page.settings().setLocalStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal,
                                                   Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical,
                                                   Qt.ScrollBarAlwaysOff)

        m_netAccessMan = NetworkAccessManager(args.disk_cache,
                                              args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(
            self.inject)
        self.m_page.loadFinished.connect(self.finish)

        do_action('PhantomInitPost', Bunch(locals()))
Exemple #42
0
            error_msg = "lxml error"
        return error_msg, url, redirected_url, html
    """
        
if __name__ == "__main__":
    # Manual check: download one page, save it to disk, then write out the
    # links that match the site pattern.
    #'''
    url = "http://www.cnbeta.com/"
    downloader = DownloadManager()
    error_msg, url, redirected_url, html = downloader.download(url)
    print "error_msg=%s" %error_msg
    print "url=%s" %url
    print "redirected_url=%s" %redirected_url
    # Keep a copy of the downloaded HTML in the working directory.
    f = open("www.cnbeta.com.html",'w')
    f.write(html)
    f.close()
    webpage = WebPage(url, html)
    webpage.parse_links()

    # Pattern: an http(s) URL whose part after the scheme contains
    # ".cnbeta.com" followed by at least one more character.
    website = 'cnbeta\.com'
    patnstr = '^(http|https)://(.*\.' + website + ')(.+)$';
    links = webpage.filter_links(tags=['a'], str_patterns=[patnstr])
    links.sort()

    f_filter_links = open('filter_links_cnbeta.txt', 'w')

    #print links
    # NOTE(review): f is re-bound to a second output file here; neither it
    # nor f_filter_links is closed in the visible part of this block.
    f = open('links_regged_cnbeta.txt','w')
    for link in links:
        # Every matching link is written to both output files.
        f_filter_links.write('%s\n' % link)
        f.write('%s\n' % link)
        # NOTE(review): the body of this loop is missing from the excerpt.
        for elem, attr, lnk, pos in webpage.doc.iterlinks():
Exemple #43
0
class Crawler():
    """Queue-driven crawler.

    Pops URLs from a queue database, stores each downloaded page, follows
    the links allowed by the configured rules and fetches the links that
    match ``file_rule`` with ``wget``.
    """

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None    # WebPage of the URL currently being processed
        self.init_database()
        self.rules = {}        # compiled URL regex -> list of compiled link regexes
        self.files = []        # one list of file links per crawled page
        self.file_rule = ".+"  # regex selecting the links handed to wget

    def init_database(self):
        """Open the queue, page, duplicate-check and repository-state stores."""
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
        self.repodb = RepoStateDB()

    def add_seeds(self, links):
        """Queue the URLs in *links* that the duplicate check has not seen yet."""
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        """Replace the rule table.

        *rules* maps a URL regex string to an iterable of link regex
        strings; everything is compiled once here.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def set_file_rule(self, rule):
        """Set the regex string used to select downloadable file links."""
        self.file_rule = rule

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of every rule matching *url*.

        The order of the returned list is unspecified (it goes through a set).
        """
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    @staticmethod
    def _wget_argv(url, dest_dir):
        """Build the argument vector that resumes a download of *url* into *dest_dir*."""
        return ["wget", "-c", url, "-P", dest_dir]

    def download_files(self, files):
        """Fetch every URL in *files* with wget and record success in ``repodb``.

        The command is run as an argument list without a shell. The URLs
        come from crawled pages, so a shell string would let characters such
        as ``&``, ``;`` or backticks be interpreted by the shell.
        """
        import subprocess

        for f in files:
            try:
                ret_code = subprocess.call(self._wget_argv(f, config.repos_dir))
            except OSError:
                # wget could not be started: record the download as failed,
                # as a non-zero os.system() status did before.
                ret_code = -1
            self.repodb.update(f, ret_code == 0)

    def start(self):
        """Process queued URLs until the queue returns ``None``."""
        while True:
            url = self.queue.pop_url()
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is None:
                continue

            self.webpagedb.html2db(url, html)

            self.webpage = WebPage(url, html)
            self.webpage.parse_links()
            ruptn = self.get_patterns_from_rules(url)
            links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
            print(links)
            self.add_seeds(links)

            file_pattern = [re.compile(self.file_rule)]
            files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
            self.files.append(files)
            #TODO:
            self.download_files(files)
            print(files)

    def mysleep(self, n):
        """Sleep *n* seconds, printing a progress line after each second."""
        for i in range(n):
            time.sleep(1)
            print("sleep %s of %s" % (i, n))
Exemple #44
0
class Phantom(QObject):
    """Object exposed to scripts as ``phantom``.

    It owns the main ``WebPage``, keeps track of every page created through
    ``createWebPage`` and records the exit state requested by the script.
    """

    def __init__(self, parent, args):
        """Create the main page, apply the settings from *args* and run bootstrap.js.

        Attributes read from *args*: ``verbose``, ``script``, ``script_args``,
        ``script_encoding``, ``output_encoding``, ``proxy``, ``load_images``,
        ``load_plugins`` and ``local_to_remote_url_access``.
        """
        super(Phantom, self).__init__(parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self, args)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.app_args = args
        self.m_scriptFile = args.script
        self.m_args = args.script_args
        self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
        # NOTE(review): encoding_sys is not a standard attribute of sys.stdout;
        # presumably stdout has been wrapped elsewhere - confirm.
        self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys)

        # The main page is tracked like any other page so exit() deletes it too.
        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        if args.proxy is None:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            # args.proxy is indexed as (host, port)
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['javascriptEnabled'] = True
        self.m_defaultPageSettings['XSSAuditingEnabled'] = False
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings['localToRemoteUrlAccessEnabled'] = args.local_to_remote_url_access
        self.m_page.applySettings(self.m_defaultPageSettings)

        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        with QPyFile(':/bootstrap.js') as f:
            bootstrap = f.readAll()
        self.m_page.mainFrame().evaluateJavaScript(bootstrap)

        do_action('PhantomInitPost')

    def execute(self):
        """Inject the start script into the main frame.

        Returns ``True`` unless the script has already requested termination
        through ``exit``.
        """
        injectJsInFrame(self.m_scriptFile, self.m_scriptEncoding.encoding, os.path.dirname(os.path.abspath(__file__)), self.m_page.mainFrame(), True)
        return not self.m_terminated

    def printConsoleMessage(self, message, lineNumber, source):
        """Print a JavaScript console message, prefixed with ``source:line`` when *source* is set."""
        if source:
            message = '%s:%d %s' % (source, lineNumber, message)
        # Single-argument call form prints the same text on Python 2 and 3.
        print(message)

    def returnValue(self):
        """Return the exit code recorded by ``exit`` (0 until then)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Script arguments taken from ``args.script_args``."""
        return self.m_args

    @pyqtSlot(result=FileSystem)
    def createFilesystem(self):
        """Return a new ``FileSystem`` object parented to this instance."""
        return FileSystem(self)

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create, track and configure a new ``WebPage``.

        The page receives the default page settings and its ``libraryPath``
        is set to the directory of the script file.
        """
        page = WebPage(self, self.app_args)
        self.m_pages.append(page)
        page.applySettings(self.m_defaultPageSettings)
        page.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """Settings mapping applied to every page created by this object."""
        return self.m_defaultPageSettings

    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record *code*, delete every tracked page and quit the application."""
        self.m_terminated = True
        self.m_returnValue = code

        # stop javascript execution in start script;
        # delete all the pages C++ objects, then clear
        # the page list, and empty the Phantom page
        for page in self.m_pages:
            sip.delete(page)
        del self.m_pages[:]
        self.m_page = None

        QApplication.instance().exit(code)

    @pyqtSlot(str, result=bool)
    def injectJs(self, filePath):
        """Inject the script at *filePath* into the main frame via ``injectJsInFrame``."""
        return injectJsInFrame(filePath, self.m_scriptEncoding.encoding, self.libraryPath, self.m_page.mainFrame())

    @pyqtSlot(str, result=str)
    def loadModuleSource(self, name):
        """Return the content read from the resource ``:/modules/<name>.js``."""
        moduleSourceFilePath = ':/modules/%s.js' % name

        with QPyFile(moduleSourceFilePath) as f:
            moduleSource = f.readAll()

        return moduleSource

    @pyqtProperty(str)
    def libraryPath(self):
        """``libraryPath`` of the main page."""
        return self.m_page.libraryPath

    @libraryPath.setter
    def libraryPath(self, dirPath):
        self.m_page.libraryPath = dirPath

    @pyqtProperty(str)
    def outputEncoding(self):
        """Name of the current output encoding."""
        return self.m_outputEncoding.name

    @outputEncoding.setter
    def outputEncoding(self, encoding):
        self.m_outputEncoding = Encode(encoding, self.m_outputEncoding.encoding)

        # Apply the new encoding to BOTH streams. The previous code assigned
        # sys.stdout.encode_to twice and never updated sys.stderr.encode_to.
        for stream in (sys.stdout, sys.stderr):
            stream.encoding = self.m_outputEncoding.encoding
            stream.encode_to = self.m_outputEncoding.encoding

    @pyqtProperty(str)
    def scriptName(self):
        """File name (last path component) of the script file."""
        return os.path.basename(self.m_scriptFile)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version as a mapping with 'major', 'minor' and 'patch' keys."""
        version = {
            'major': __version_info__[0],
            'minor': __version_info__[1],
            'patch': __version_info__[2]
        }
        return version

    # Plugin hook fired while the class body executes.
    do_action('Phantom')
Exemple #45
0
 def createWebPage(self):
     """Create, track and configure a new ``WebPage``.

     The page is built from this object and ``app_args``, appended to
     ``m_pages``, given the default page settings, and its ``libraryPath``
     is set to the directory of the script file.
     """
     new_page = WebPage(self, self.app_args)
     self.m_pages.append(new_page)
     new_page.applySettings(self.m_defaultPageSettings)
     script_path = os.path.abspath(self.m_scriptFile)
     new_page.libraryPath = os.path.dirname(script_path)
     return new_page
Exemple #46
0
class Crawler():
    """Queue-driven crawler that stores pages and follows rule-matched links."""

    def __init__(self):
        self.downloader = DownloadManager()  # object that downloads pages
        self.webpage = None                  # object that parses the current page
        self.initDatabase()
        self.rules = {}                      # compiled URL regex -> list of compiled link regexes

    # Initialise the databases.
    def initDatabase(self):
        self.queue = QueueDB()  # TODO table (URLs still to crawl)
        self.webpagedb = WebpageDB()
        self.duplcheck = DuplCheckDB()

    # Add seed URLs.
    # Parameter: links - list of URLs
    def addSeeds(self, links):
        new_links = self.duplcheck.filterDuplUrls(links)  # drop duplicate URLs
        self.duplcheck.addUrls(new_links)                 # record them as already seen
        self.queue.pushUrls(new_links)                    # append them to the TODO table

    def addRules(self, rules):
        """Replace the rule table.

        *rules* maps a URL regex string to an iterable of link regex
        strings; everything is compiled once here.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of every rule matching *url*.

        The order of the returned list is unspecified (it goes through a set).
        """
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    # Start crawling.
    def start(self):
        """Process queued URLs until the queue returns ``None``, pausing 3 s per URL."""
        while True:
            url = self.queue.popUrl()
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is not None:
                self.webpagedb.storeHtmlToDb(url, html)  # store the page

                self.webpage = WebPage(url, html)  # start parsing the page
                self.webpage.parseLinks()          # collect all hyperlinks
                ruptn = self.get_patterns_from_rules(url)
                print(ruptn)
                # The original author noted that this call can yield None,
                # hence the truthiness guard below.
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                if links:
                    self.addSeeds(links)
            self.mysleep(3)  # rest a little before crawling on

    def mysleep(self, n):
        """Sleep *n* seconds, printing a 1-based progress line after each second."""
        for i in range(1, n + 1):
            time.sleep(1)
            print("sleep %s of %s" % (i, n))
Exemple #47
0
def _read_text(path):
    """Return the whole content of *path*, closing the file afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


def _global_counts(text, word_count):
    """Return the numeric columns of every row of the combined CSV text.

    A row is matched as ``<digits>-<..>-<..>`` followed by *word_count*
    ``,<digits>`` columns. The digits of those columns are returned as one
    flat list of strings, row after row.
    """
    columns = ',[0-9]+' * word_count
    counts = []
    for row in re.findall('[0-9]+-.*?-.*?' + columns, text):
        counts += re.findall(r'\d+', re.findall(columns, row)[0])
    return counts


def _word_counts(text):
    """Return the single numeric column of every row of a per-word CSV text."""
    counts = []
    for row in re.findall(r'[0-9]+-.*?-.*?,[0-9]+', text):
        counts += re.findall(r'\d+', re.findall(',[0-9]+', row)[0])
    return counts


def OnMouseDown(event, arg, login, password, instr, flag, link, canvas):
    """Handle the click: gather the data for the entered words, plot and run make.

    Parameters (as used below):
      arg      -- sequence of objects with ``get()``; non-empty values become the words
      login, password -- passed to ``WebPage.getContent`` when data is fetched
      instr    -- sequence of objects with ``get()``; joined into the ``ARGS=`` value
      flag     -- object with ``get()``; ``1`` means "use the CSV files already on disk"
      link     -- object with ``set()``; receives ``page.link``
      canvas   -- object with ``create_line()`` used for the plot

    Reads ``data.csv`` and ``<word>.csv``, writes
    ``Java/LetTheDataSpeak/data.txt``, then calls ``make`` with the
    assembled ``ARGS=...`` argument.
    """
    import time

    words = [entry.get() for entry in arg if entry.get() != ""]
    local_only = flag.get() == 1

    # The combined page always provides the link. Its content is fetched
    # only when the data is not already available locally.
    page = WebPage(words)
    link.set(page.link)
    if not local_only:
        page.getContent(login, password)

    # get global data
    globalData = _global_counts(_read_text('data.csv'), len(words))

    if not local_only:
        # get data from website, one word at a time
        for word in words:
            page = WebPage([word])
            link.set(page.link)
            page.getContent(login, password)

        # rename: data(N).csv becomes <word>.csv. The argument list is built
        # directly instead of splitting a command string on whitespace, so a
        # word is always passed as a single argument.
        for index, word in enumerate(words):
            renamer = subprocess.Popen(
                ["rename", "data(%d).csv" % (index + 1), word + ".csv"],
                stdout=subprocess.PIPE)
            renamer.communicate()

    # read individual
    allwords = [_word_counts(_read_text(word + ".csv")) for word in words]

    # output: the file is closed (and therefore flushed) before make runs.
    with open('Java/LetTheDataSpeak/data.txt', 'w') as output:
        if allwords:
            for i in range(len(allwords[0])):
                for j, series in enumerate(allwords):
                    output.write(series[i] + " " + globalData[i * len(words) + j] + " ")
                output.write("\n")

    canvas.create_line(0, 114, 520, 114, fill="black", width=3)
    color = ["coral", "yellow", "green", "blue", "red"]
    # visualisation: one poly-line per word, at most 520 segments wide.
    for i, series in enumerate(allwords):
        # A segment joins sample j to sample j+1, so the last sample only
        # ever serves as an end point (the old loop indexed one past it).
        segments = min(len(allwords[0]), len(series)) - 1
        for j in range(min(segments, 520)):
            canvas.create_line(j, 110 - int(series[j]), j + 1, 110 - int(series[j + 1]),
                               fill=color[i % len(color)], width=1)

    time.sleep(5)

    ARGS = "ARGS=" + "".join(instr[i].get() + ' ' for i in range(len(words)))
    print(ARGS)
    sys.stdout.flush()
    subprocess.call(["make", ARGS])
Exemple #48
0
class Crawler(object):
    """Queue-driven crawler that stores pages and follows rule-matched links."""

    def __init__(self):
        super(Crawler, self).__init__()
        self.downloader = DownloadManager()
        self.webpage = None  # WebPage of the URL currently being processed
        self.init_database()
        self.rules = {}      # compiled URL regex -> list of compiled link regexes

    def init_database(self):
        """Open the queue, page and duplicate-check stores."""
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        """Queue the URLs in *links* that the duplicate check has not seen yet."""
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        """Replace the rule table.

        *rules* maps a URL regex string to an iterable of link regex
        strings; everything is compiled once here.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of every rule matching *url*.

        The order of the returned list is unspecified (it goes through a set).
        """
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def getlinks(self, url, html):
        """Parse *html* and return the ``<a>`` links allowed by the rules for *url*."""
        self.webpage = WebPage(url, html)
        self.webpage.parse_links()
        ruptn = self.get_patterns_from_rules(url)
        return self.webpage.filter_links(tags=['a'], patterns=ruptn)

    def start(self):
        """Process queued URLs until the queue returns ``None``, pausing 3 s per URL."""
        while True:
            url = self.queue.pop_url()
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(
                url)
            if html is not None:
                self.webpagedb.html2db(url, html)
                links = self.getlinks(url, html)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        """Sleep *n* seconds, printing a progress line after each second."""
        for i in range(n):
            time.sleep(1)
            print("sleep %s of %s" % (i, n))
Exemple #49
0
 def get(self, url):
     """Fetch ``baseUrl + url`` with ``requests`` and wrap the response text in a ``WebPage``."""
     target = self.baseUrl + url
     response = requests.get(target)
     return WebPage(response.text)
        print("Usage: {} <source_url> [max num of internal_links]".format(
            sys.argv[0]))
        sys.exit(0)

    source_url = sys.argv[1]
    max_sites = float('inf')
    if len(sys.argv) == 3:
        max_sites = int(sys.argv[2])

    to_process = [source_url]
    visited = set()
    internal_links = []

    while len(to_process):
        url = to_process.pop()
        w = WebPageLinkExtractor(WebPage(url))

        if len(visited) >= max_sites:
            break

        for web in w.internal_links:
            index = web.url.find('#')
            if index != -1:
                web.url = web.url[:index]
            if web.url not in visited:
                visited.add(web.url)
                to_process.append(web.url)

        time.sleep(0.3)

    for link in visited:
Exemple #51
0
 def createWebPage(self):
     """Create and configure a new ``WebPage`` owned by this object.

     The page receives the default page settings and the shared network
     access manager, and its ``scriptLookupDir`` is set to the directory of
     the script file.
     """
     new_page = WebPage(self)
     new_page.applySettings(self.m_defaultPageSettings)
     new_page.setNetworkAccessManager(self.m_netAccessMan)
     script_path = os.path.abspath(self.m_scriptFile)
     new_page.scriptLookupDir = os.path.dirname(script_path)
     return new_page
Exemple #52
0
class Phantom(QObject):
    """Controller object exposed to scripts as the JavaScript global ``phantom``.

    Owns a single WebPage, evaluates the user script inside it, and offers
    the properties and slots defined below (rendering, script loading,
    viewport / clip-rectangle control, exit handling) to that script.
    """

    def __init__(self, args, parent=None):
        """Configure the page, proxy, network access and JavaScript bridge.

        ``args`` is the parsed command line. The attributes read here are
        ``verbose``, ``script`` (an open file object, closed once read),
        ``script_args``, ``upload_file``, ``load_images``, ``load_plugins``,
        ``proxy``, ``disk_cache`` and ``ignore_ssl_errors``.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_loadStatus = self.m_state = ''
        # These must be three independent dicts. Binding them to one shared
        # object would make ctx() values, the paper size and the loadScript
        # cache overwrite each other.
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        # Defined up front so returnValue() works even if exit() never ran.
        self.m_returnValue = 0
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # setup the values from args
        self.m_script = args.script.read()
        self.m_scriptFile = args.script.name
        self.m_scriptDir = os.path.dirname(args.script.name) + '/'
        self.m_args = args.script_args
        self.m_upload_file = args.upload_file
        # NOTE: the local names in this method are handed to plugins through
        # Bunch(locals()), so they are kept stable.
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script.close()

        do_action('PhantomInitPre', Bunch(locals()))

        # Transparent base brush so pages without a background render clear.
        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                                  int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages,
                                            autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled,
                                            pluginsEnabled)
        self.m_page.settings().setAttribute(
            QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(
            QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled,
                                            True)
        self.m_page.settings().setLocalStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal,
                                                   Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical,
                                                   Qt.ScrollBarAlwaysOff)

        m_netAccessMan = NetworkAccessManager(args.disk_cache,
                                              args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(
            self.inject)
        self.m_page.loadFinished.connect(self.finish)

        do_action('PhantomInitPost', Bunch(locals()))

    def execute(self):
        """Evaluate the user script in the page's main frame.

        A leading ``#!`` line is commented out, and ``.coffee`` scripts are
        converted with CSConverter first.
        """
        if self.m_script.startswith('#!'):
            self.m_script = '//' + self.m_script

        if self.m_scriptFile.lower().endswith('.coffee'):
            coffee = CSConverter(self)
            self.m_script = coffee.convert(self.m_script)

        self.m_page.mainFrame().evaluateJavaScript(self.m_script)

    def finish(self, success):
        """Record the load result and re-run the script (loadFinished handler)."""
        self.m_loadStatus = 'success' if success else 'fail'
        self.m_page.mainFrame().evaluateJavaScript(self.m_script)

    def inject(self):
        """Expose this object to page JavaScript under the name ``phantom``."""
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

    def renderPdf(self, fileName):
        """Print the main frame to ``fileName`` as a PDF.

        Returns True on success, or False when the paper size has neither a
        width/height pair nor a ``format`` entry.
        """
        p = QPrinter()
        p.setOutputFormat(QPrinter.PdfFormat)
        p.setOutputFileName(fileName)
        p.setResolution(pdf_dpi)
        # Work on a copy: the defaults filled in below describe the page as
        # it is right now and must not be stored in the paperSize property.
        paperSize = dict(self.m_paperSize)

        if not paperSize:
            pageSize = QSize(self.m_page.mainFrame().contentsSize())
            paperSize['width'] = str(pageSize.width()) + 'px'
            paperSize['height'] = str(pageSize.height()) + 'px'
            paperSize['border'] = '0px'

        if paperSize.get('width') and paperSize.get('height'):
            sizePt = QSizeF(ceil(self.stringToPointSize(paperSize['width'])),
                            ceil(self.stringToPointSize(paperSize['height'])))
            p.setPaperSize(sizePt, QPrinter.Point)
        elif 'format' in paperSize:
            orientationName = paperSize.get('orientation')
            if orientationName and orientationName.lower() == 'landscape':
                orientation = QPrinter.Landscape
            else:
                orientation = QPrinter.Portrait
            p.setOrientation(QPrinter.Orientation(orientation))

            formats = {
                'A3': QPrinter.A3,
                'A4': QPrinter.A4,
                'A5': QPrinter.A5,
                'Legal': QPrinter.Legal,
                'Letter': QPrinter.Letter,
                'Tabloid': QPrinter.Tabloid
            }

            p.setPaperSize(QPrinter.A4)  # fallback
            wanted = paperSize['format'].lower()
            # Case-insensitive match against the known format names.
            for formatName, size in formats.items():
                if formatName.lower() == wanted:
                    p.setPaperSize(size)
                    break
        else:
            return False

        border = floor(self.stringToPointSize(
            paperSize['border'])) if paperSize.get('border') else 0
        p.setPageMargins(border, border, border, border, QPrinter.Point)

        self.m_page.mainFrame().print_(p)
        return True

    def returnValue(self):
        """Return the exit code set by ``exit()`` (0 if it was never called)."""
        return self.m_returnValue

    def stringToPointSize(self, string):
        """Convert a size string such as ``'10mm'`` or ``'300px'`` to points.

        Recognised suffixes are mm, cm, in and px; a string without a suffix
        is treated like px.
        """
        units = (('mm', 72 / 25.4), ('cm', 72 / 2.54), ('in', 72.0),
                 ('px', 72.0 / pdf_dpi / 2.54), ('', 72.0 / pdf_dpi / 2.54))

        # The empty suffix always matches, so the loop returns for any input
        # and the trailing ``return 0`` is only a safety net.
        for unit, factor in units:
            if string.endswith(unit):
                value = string.rstrip(unit)
                return float(value) * factor
        return 0

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Arguments passed to the script."""
        return self.m_args

    @pyqtProperty('QVariantMap')
    def clipRect(self):
        """Current clip rectangle as a width/height/top/left mapping."""
        result = {
            'width': self.m_clipRect.width(),
            'height': self.m_clipRect.height(),
            'top': self.m_clipRect.top(),
            'left': self.m_clipRect.left()
        }
        return result

    @clipRect.setter
    def clipRect(self, size):
        """Update the clip rectangle from a (possibly partial) mapping.

        Missing keys keep their current value; negative width/height are
        clamped to 0, while top/left may be negative.
        """
        # Values are collected in a local dict rather than in module
        # globals(), so nothing leaks out of this method.
        rect = {}
        for item in ('width', 'height', 'top', 'left'):
            try:
                value = int(size[item])
            except KeyError:
                value = getattr(self.m_clipRect, item)()
            else:
                if value < 0 and item not in ('top', 'left'):
                    value = 0
            rect[item] = value

        self.m_clipRect = QRect(rect['left'], rect['top'],
                                rect['width'], rect['height'])

    @pyqtProperty(str)
    def content(self):
        """HTML of the main frame."""
        return self.m_page.mainFrame().toHtml()

    @content.setter
    def content(self, content):
        self.m_page.mainFrame().setHtml(content)

    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Store ``code`` as the return value and schedule application quit."""
        self.m_returnValue = code
        # Stop finish() from re-running the script while shutting down.
        self.m_page.loadFinished.disconnect(self.finish)
        QTimer.singleShot(0, qApp, SLOT('quit()'))

    @pyqtProperty(str)
    def loadStatus(self):
        """Load status string: '', 'loading', 'success' or 'fail'."""
        return self.m_loadStatus

    @pyqtSlot(str, result=bool)
    def loadScript(self, script):
        """Load and evaluate a script file relative to the main script's dir.

        Returns False when the file cannot be opened or read (IOError), and
        True otherwise. Processed sources are cached by file name.
        """
        if script in self.m_loadScript_cache:
            self.m_page.mainFrame().evaluateJavaScript(
                self.m_loadScript_cache[script])
            return True

        scriptFile = script
        try:
            # The context manager closes the handle even if read() fails.
            with codecs.open(self.m_scriptDir + script,
                             encoding='utf-8') as handle:
                script = handle.read()
        except IOError:
            return False

        if script.startswith('#!'):
            script = '//' + script

        if scriptFile.lower().endswith('.coffee'):
            coffee = CSConverter(self)
            script = coffee.convert(script)

        self.m_loadScript_cache[scriptFile] = script
        self.m_page.mainFrame().evaluateJavaScript(script)
        return True

    @pyqtSlot(str, name='open')
    def open_(self, address):
        """Stop any current load and navigate the main frame to ``address``."""
        qDebug('Opening address %s' % address)
        self.m_page.triggerAction(QWebPage.Stop)
        self.m_loadStatus = 'loading'
        self.m_page.mainFrame().setUrl(QUrl(address))

    @pyqtProperty('QVariantMap')
    def paperSize(self):
        """Paper size mapping used by PDF rendering."""
        return self.m_paperSize

    @paperSize.setter
    def paperSize(self, size):
        self.m_paperSize = size

    @pyqtSlot(str, result=bool)
    def render(self, fileName):
        """Render the page to ``fileName``; ``.pdf`` names go to renderPdf.

        Returns the result of the save (or of renderPdf), or False when the
        page has no contents to paint.
        """
        fileInfo = QFileInfo(fileName)
        path = QDir()
        path.mkpath(fileInfo.absolutePath())

        if fileName.lower().endswith('.pdf'):
            return self.renderPdf(fileName)

        viewportSize = QSize(self.m_page.viewportSize())
        pageSize = QSize(self.m_page.mainFrame().contentsSize())

        # An empty contents size means there is nothing to paint. Comparing
        # the QSize against '' could never be true, so ask the QSize itself.
        if pageSize.isEmpty():
            return False

        if not self.m_clipRect.isEmpty():
            bufferSize = self.m_clipRect.size()
        else:
            bufferSize = self.m_page.mainFrame().contentsSize()

        image = QImage(bufferSize, QImage.Format_ARGB32)
        image.fill(qRgba(255, 255, 255, 0))
        p = QPainter(image)

        p.setRenderHint(QPainter.Antialiasing, True)
        p.setRenderHint(QPainter.TextAntialiasing, True)
        p.setRenderHint(QPainter.SmoothPixmapTransform, True)

        # Temporarily grow the viewport to the full page so everything is
        # laid out; the original size is restored after painting.
        self.m_page.setViewportSize(pageSize)

        if not self.m_clipRect.isEmpty():
            p.translate(-self.m_clipRect.left(), -self.m_clipRect.top())
            self.m_page.mainFrame().render(p, QRegion(self.m_clipRect))
        else:
            self.m_page.mainFrame().render(p)

        p.end()
        self.m_page.setViewportSize(viewportSize)
        return image.save(fileName)

    @pyqtSlot('QWebElement', str)
    def setFormInputFile(self, el, fileTag):
        """Record ``fileTag`` on the page, then click ``el`` via a synthetic event."""
        self.m_page.m_nextFileTag = fileTag
        el.evaluateJavaScript('''(function(target){
                              var evt = document.createEvent('MouseEvents');
                              evt.initMouseEvent("click", true, true, window,
                              0, 0, 0, 0, 0, false, false, false, false, 0, null);
                              target.dispatchEvent(evt);})(this);''')

    @pyqtSlot(int)
    def sleep(self, ms):
        """Wait at least ``ms`` milliseconds while still processing Qt events."""
        startTime = QTime.currentTime()
        while True:
            QApplication.processEvents(QEventLoop.AllEvents, 25)
            if startTime.msecsTo(QTime.currentTime()) > ms:
                break
            usleep(0.005)

    @pyqtProperty(str)
    def state(self):
        """Free-form state string kept for the script."""
        return self.m_state

    @state.setter
    def state(self, value):
        self.m_state = value

    @pyqtProperty(str)
    def userAgent(self):
        """User agent string stored on the page."""
        return self.m_page.m_userAgent

    @userAgent.setter
    def userAgent(self, ua):
        self.m_page.m_userAgent = ua

    @pyqtSlot(str, result='QVariant')
    @pyqtSlot(int, result='QVariant')
    @pyqtSlot(str, 'QVariant')
    @pyqtSlot(int, 'QVariant')
    def ctx(self, name, value=None):
        """Get (one argument) or set (two arguments) a script context variable."""
        # NOTE(review): any falsy value (0, '', False) is treated as a read,
        # so such values cannot be stored. Left unchanged because the exact
        # object PyQt passes for a JavaScript null is not visible here.
        if not value:
            return self.m_var.get(name)
        self.m_var[name] = value

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version as a major/minor/patch mapping."""
        version = {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }
        return version

    @pyqtProperty('QVariantMap')
    def viewportSize(self):
        """Current viewport size as a width/height mapping."""
        size = self.m_page.viewportSize()
        result = {'width': size.width(), 'height': size.height()}
        return result

    @viewportSize.setter
    def viewportSize(self, size):
        """Update the viewport from a (possibly partial) width/height mapping.

        Missing keys keep their current value; negatives are clamped to 0.
        """
        # Local dict instead of module globals(); see the clipRect setter.
        dims = {}
        for item in ('width', 'height'):
            try:
                dims[item] = max(int(size[item]), 0)
            except KeyError:
                dims[item] = getattr(self.m_page.viewportSize(), item)()

        self.m_page.setViewportSize(QSize(dims['width'], dims['height']))

    do_action('Phantom', Bunch(locals()))
Exemple #53
0
class WebBrowser(QObject):
    """Headless QWebView wrapper with its own cookie jar and network manager.

    Redirect targets reported on finished replies are followed manually by
    reloading the frame that issued the original request.
    """

    def __init__(self):
        """Create (or reuse) the QApplication and wire up page, view and network."""
        logging.debug("-->")
        super(WebBrowser, self).__init__()
        self.app = QApplication.instance()
        if self.app is None:
            # No application yet: create one that survives window closes.
            self.app = QApplication(sys.argv)
            self.app.setQuitOnLastWindowClosed(False)
        self.event_loop = QEventLoop()
        self.cookie_jar = CookieJar()
        # Built but not installed; the setProxy call is disabled below.
        self.proxy = QNetworkProxy(QNetworkProxy.HttpProxy, "127.0.1.1", 8888)
        self.network_manager = NetworkAccessManager()
        self.network_manager.setCookieJar(self.cookie_jar)
        # self.network_manager.setProxy(self.proxy)
        self.web_page = WebPage()
        self.web_page.setNetworkAccessManager(self.network_manager)
        self.web_view = QWebView()
        self.web_view.setPage(self.web_page)
        view_settings = self.web_view.settings()
        # XSSAuditingEnabled is intentionally left at its default.
        for attribute, enabled in (
                (QWebSettings.AutoLoadImages, False),
                (QWebSettings.PluginsEnabled, True),
                (QWebSettings.JavascriptEnabled, True),
                (QWebSettings.LocalContentCanAccessRemoteUrls, True)):
            view_settings.setAttribute(attribute, enabled)
        self.connect(self.web_view.page().networkAccessManager(),
                     SIGNAL("finished(QNetworkReply*)"),
                     self.network_reply_finished)
        # Per-load state, all initially unset.
        self.page_loaded_validator = None
        self.page_loaded_handler = None
        self.page_loaded_handler_kwargs = None
        self.timeout_message = None
        self.timer = None
        self.event_loop_exception = None
        logging.debug("<--")

    def network_reply_finished(self, reply):
        """Mark the request as completed and follow a redirect if one is set."""
        request = reply.request()
        requested_url = request.url()
        logging.debug("Reply received for: " + requested_url.toString())
        self.network_manager.request_queue[requested_url] = "Completed"
        target = reply.attribute(QNetworkRequest.RedirectionTargetAttribute)
        redirect_url = self.get_redirect_url(target, requested_url)
        if redirect_url is not None:
            self.redirect(redirect_url, request)

    def redirect(self, url, request):
        """Load ``url`` in the frame that originally issued ``request``, if any."""
        main_frame = self.web_view.page().mainFrame()
        frame = self.find_frame_to_redirect(main_frame, request)
        if frame is None:
            return
        logging.debug("Redirecting to: " + url.toString())
        frame.load(url)

    def find_frame_to_redirect(self, frame, request):
        """Depth-first search for the frame whose requested URL matches ``request``.

        Returns the matching frame, or None when neither ``frame`` nor any
        descendant matches.
        """
        if frame.requestedUrl() == request.url():
            return frame
        for child in frame.childFrames():
            match = self.find_frame_to_redirect(child, request)
            if match is not None:
                return match
        return None

    def get_redirect_url(self, possible_redirect_url, orig_requested_url):
        """Return the URL to redirect to, or None when no redirect applies.

        A relative target inherits scheme and host from the original URL
        (the passed-in target object is modified in place). A target equal
        to the original URL is not treated as a redirect.
        """
        if possible_redirect_url is None:
            return None
        if possible_redirect_url.isRelative():
            if orig_requested_url.isRelative():
                return None
            possible_redirect_url.setScheme(orig_requested_url.scheme())
            possible_redirect_url.setHost(orig_requested_url.host())
        if orig_requested_url != possible_redirect_url:
            return possible_redirect_url
        return None

    def get_cookies(self):
        """Return every cookie in the jar in raw form."""
        return [cookie.toRawForm() for cookie in self.cookie_jar.allCookies()]

    def set_cookies(self, raw_cookies):
        """Replace the jar's contents with cookies parsed from ``raw_cookies``."""
        parsed = []
        for raw_cookie in raw_cookies:
            parsed.extend(QNetworkCookie.parseCookies(raw_cookie))
        self.cookie_jar.setAllCookies(parsed)

    def cleanup(self):
        """Disconnect signals, clear the Qt parents and drop the references."""
        logging.debug("-->")
        self.disconnect(self.web_view.page().networkAccessManager(),
                        SIGNAL("finished(QNetworkReply*)"),
                        self.network_reply_finished)
        self.web_view.setParent(None)
        self.web_page.setParent(None)
        self.network_manager.setParent(None)
        self.event_loop.setParent(None)
        self.setParent(None)
        # Same deletion order as the parent clearing above, then the app.
        del (self.web_view, self.web_page, self.network_manager,
             self.event_loop, self.app)
        logging.debug("<--")
Exemple #54
0
 def createWebPage(self):
     """Create a WebPage, track it in ``m_pages`` and configure it.

     The page gets the default page settings, and its ``libraryPath`` is set
     to the directory containing the script file.
     """
     new_page = WebPage(self, self.app_args)
     self.m_pages.append(new_page)
     new_page.applySettings(self.m_defaultPageSettings)
     script_path = os.path.abspath(self.m_scriptFile)
     new_page.libraryPath = os.path.dirname(script_path)
     return new_page
Exemple #55
0
class Phantom(QObject):
    """Controller object exposed to the bootstrap page as ``phantom``.

    Holds the default page settings and the shared network access manager,
    creates further WebPage objects on request, and carries the exit code.
    """

    def __init__(self, args, parent=None):
        """Set up the bootstrap page and evaluate ``:/bootstrap.js`` in it.

        Exits the process with status 1 when the bootstrap resource cannot
        be opened or is empty.
        """
        QObject.__init__(self, parent)

        # instance state
        self.m_defaultPageSettings = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # values taken from the parsed command line
        self.m_script = args.script
        self.m_scriptFile = args.script_name
        self.m_args = args.script_args

        # NOTE: local names in this method are visible to plugins through
        # Bunch(locals()) and are therefore kept as they are.
        do_action('PhantomInitPre', Bunch(locals()))

        if args.proxy:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)
        else:
            QNetworkProxyFactory.setUseSystemConfiguration(True)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings.update({
            'loadImages': args.load_images,
            'loadPlugins': args.load_plugins,
            'userAgent': self.m_page.userAgent(),
        })
        self.m_page.applySettings(self.m_defaultPageSettings)

        # make our properties and slots reachable from javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            qCritical('Can not bootstrap!')
            sys.exit(1)
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        # An empty bootstrap script is as fatal as an unreadable one.
        if not bootstrapper:
            qCritical('Can not bootstrap!')
            sys.exit(1)
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        do_action('PhantomInitPost', Bunch(locals()))

    def execute(self):
        """Evaluate the user script; return False if it called ``exit()``.

        ``.coffee`` scripts are converted first, then a leading ``#!`` line
        is commented out.
        """
        is_coffee = self.m_scriptFile.lower().endswith('.coffee')
        if is_coffee:
            converter = CSConverter(self)
            self.m_script = converter.convert(self.m_script)

        if self.m_script.startswith('#!'):
            self.m_script = '//' + self.m_script

        main_frame = self.m_page.mainFrame()
        main_frame.evaluateJavaScript(self.m_script)
        return not self.m_terminated

    def printConsoleMessage(self, msg):
        """Print a JavaScript console message to stdout."""
        print(msg)

    def returnValue(self):
        """Return the exit code recorded by ``exit()`` (0 by default)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """Arguments passed to the script."""
        return self.m_args

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create a WebPage with the default settings and shared network manager."""
        new_page = WebPage(self)
        new_page.applySettings(self.m_defaultPageSettings)
        new_page.setNetworkAccessManager(self.m_netAccessMan)
        return new_page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """Settings mapping applied to every page created here."""
        return self.m_defaultPageSettings

    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Mark the run as terminated and exit the application with ``code``."""
        self.m_terminated = True
        self.m_returnValue = code
        QApplication.instance().exit(code)

    @pyqtProperty('QVariantMap')
    def version(self):
        """Version as a major/minor/patch mapping."""
        return {
            'major': version_major,
            'minor': version_minor,
            'patch': version_patch
        }

    do_action('Phantom', Bunch(locals()))
Exemple #56
0
    def __init__(self, args, parent=None):
        """Configure the page, proxy and JavaScript bridge from ``args``.

        ``args.script`` is a sequence whose first item is the open script
        file (read, then closed here) and whose remaining items are passed
        on as the script's arguments. Also read: ``verbose``,
        ``upload_file``, ``load_images``, ``load_plugins`` and ``proxy``.

        Side effect: when the script path has a directory component, the
        process working directory is changed to it.
        """
        QObject.__init__(self, parent)

        # variable declarations
        # Separate objects on purpose: QString and dict are mutable, so
        # binding several attributes to a single instance would make a
        # change through one name show up under the others.
        self.m_loadStatus = QString()
        self.m_state = QString()
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # setup the values from args
        self.m_script = QString.fromUtf8(args.script[0].read())
        self.m_scriptFile = args.script[0].name
        self.m_args = args.script[1:]
        self.m_upload_file = args.upload_file
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script[0].close()

        # Transparent base brush so pages without a background render clear.
        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                                  int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages,
                                            autoLoadImages)
        self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled,
                                            pluginsEnabled)
        self.m_page.settings().setAttribute(
            QWebSettings.FrameFlatteningEnabled, True)
        self.m_page.settings().setAttribute(
            QWebSettings.OfflineStorageDatabaseEnabled, True)
        self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled,
                                            True)
        self.m_page.settings().setLocalStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        self.m_page.settings().setOfflineStoragePath(
            QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal,
                                                   Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical,
                                                   Qt.ScrollBarAlwaysOff)

        # if our script was called in a different directory, change to it
        # to make any dealings with files be relative to the scripts directory
        if os.path.dirname(self.m_scriptFile):
            os.chdir(os.path.dirname(self.m_scriptFile))

        # The custom network access manager is only installed in verbose mode.
        if self.m_verbose:
            m_netAccessMan = NetworkAccessManager(self)
            self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.connect(self.m_page.mainFrame(),
                     SIGNAL('javaScriptWindowObjectCleared()'), self.inject)
        self.connect(self.m_page, SIGNAL('loadFinished(bool)'), self.finish)
Exemple #57
0
 def createWebPage(self):
     """Create a WebPage with the default settings and shared network manager."""
     new_page = WebPage(self)
     new_page.applySettings(self.m_defaultPageSettings)
     new_page.setNetworkAccessManager(self.m_netAccessMan)
     return new_page