Example #1
 def stopLoading(self):
     try:
         self.driver.execute_script("window.stop();")
         self.driver.find_elements_by_css_selector("*")[0].send_keys(
             Keys.CONTROL + 'Escape')
     except Exception as e:
         logError("Exception location: Browser.stopLoading()\n" + str(e),
                  self)
Example #2
 def noAjaxSleepThenCallback(self):
     try:
         self.html()
     except Exception as e:
         logError(
             "Exception location: browser.noAjaxSleepThenCallback()\n" +
             str(e), self)
     self.tryRelease()
Example #3
 def ajaxSleepThenCallback(self):
     self.doAjaxSleep()
     try:
         self.html()
     except Exception as e:
         logError(
             "Exception location: browser.ajaxSleepThenCallback()\n" +
             str(e), self)
     # Here the sleep and the callback are done, so we can unlock the object:
     self.tryRelease()
Example #4
    def checkProxy(self):
        """
            This method returns False if the proxy is not correctly set.
        """
        if self.proxy is None:
            logError("Proxy not correctly set.", self)
            return False
        else:
            webSiteList = \
            [
                "http://fr.geoipview.com",
                # On this web site, with a proxy the page loads a lot of "near ip" entries, so it's slow:
                # "http://www.localiser-ip.com",
                "http://www.mon-ip.com",
                "https://www.expressvpn.com/what-is-my-ip",
                "https://www.adresseip.com",
            ]

            def getWebSiteIP(url):
                try:
                    data = self.html(url)["html"]
                    ip = re.search(r"\d+[.]\d+[.]\d+[.]\d+", data).group(0)
                    return ip
                except Exception as e:
                    # logWarning("Ip not found in " + url + " " + str(e), self)
                    return None

            self.driver.set_page_load_timeout(self.checkProxyTimeout)
            previousAjaxSleep = self.ajaxSleep
            self.ajaxSleep = 0.0
            ipWithoutProxy = getIP()
            success = False
            # log("This computer ip is " + ipWithoutProxy, self)
            for current in webSiteList:
                proxyIP = getWebSiteIP(current)
                if proxyIP is not None:
                    if self.proxy["ip"] != proxyIP:
                        break
                    if proxyIP == ipWithoutProxy:
                        break
                    success = True
                    break
            self.ajaxSleep = previousAjaxSleep
            self.driver.set_page_load_timeout(self.pageLoadTimeout)
            if success:
                log(
                    "Successfully init " + self.name + " with proxy " +
                    proxyIP, self)
                return True
            else:
                logWarning(
                    self.name + " failed to use proxy " + self.proxy["ip"],
                    self)
                return False
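The checkProxy() method above compares each site's reported IP against getIP(), which is not defined in these examples. A minimal sketch of what such a helper could look like, assuming the requests library and the public ipify endpoint (both the dependency and the endpoint are assumptions, not part of the original code):

    import requests

    def getIP(timeout=10):
        # Hypothetical helper: return this machine's public IP as seen WITHOUT the proxy,
        # fetched from a plain-text IP echo service (assumed implementation):
        try:
            return requests.get("https://api.ipify.org", timeout=timeout).text.strip()
        except Exception:
            return None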
Example #5
 def initDriver(self, retry=True):
     try:
         self.driver = webdriver.PhantomJS(
             self.phantomjsPath,
             service_args=self.getPhantomJSServiceArgs())
         self.driver.set_page_load_timeout(self.pageLoadTimeout)
     except Exception as e:
         if retry:
             time.sleep(2)
             self.initDriver(retry=False)
         else:
             logError("This driver can't be initialized: " + str(e), self)
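initDriver() passes self.getPhantomJSServiceArgs() to the PhantomJS driver, but that method is not shown in these examples. A plausible sketch, assuming it simply forwards the proxy settings as standard PhantomJS command-line switches; the method body and the self.proxy["port"] field are assumptions (only self.proxy["ip"] appears in the original code):

    def getPhantomJSServiceArgs(self):
        # Hypothetical body: forward the configured proxy to PhantomJS.
        # --proxy, --proxy-type and --ignore-ssl-errors are standard PhantomJS switches.
        args = ["--ignore-ssl-errors=true"]
        if self.proxy is not None:
            args.append("--proxy=" + self.proxy["ip"] + ":" + str(self.proxy["port"]))
            args.append("--proxy-type=http")
        return args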
Example #6
 def getDriverData(self):
     lastUrl = None
     title = None
     html = None
     try:
         lastUrl = self.driver.current_url
         title = self.driver.title.strip()
         html = self.driver.page_source
     except Exception as e:
         logError("Exception location: browser.getDriverData()\n" + str(e),
                  self)
     return (title, html, lastUrl)
Example #7
    def html(self, crawlingElement=None):
        """
            This function returns data. Call the "get" method instead if you provided an htmlCallback.
        """
        # We convert the url:
        crawlingElement = tryUrlToCrawlingElement(crawlingElement)
        try:
            # Here it's the user who calls this method:
            if crawlingElement is not None:
                ok = self.get(crawlingElement)
            # Here, if the htmlCallback is not None, it's the get method which calls html():
            elif self.htmlCallback is not None:
                crawlingElement = self.currentCrawlingElement
                ok = self.lastGetIsOk
            else:
                logError(
                    "You can't be in both scenarios described in the doc.",
                    self)
                exit()

            # Now we try to get some data:
            (title, html, lastUrl) = self.getDriverData()

            # And we get the status:
            status = self.getStatus(ok, crawlingElement, lastUrl, title, html)

            # Now that we have all the data, we can build the data dict:
            data = \
            {
                "status": status,
                "crawlingElement": crawlingElement,
                "lastUrl": lastUrl,
                "html": html, # theHtml[0:20]
                "title": title,
            }

            # And we log information:
            ip = " "
            if self.proxy is not None:
                ip = " (" + self.proxy["ip"] + ") "
            logInfo(
                str(status.name) + " from " + self.name + ip +
                str(crawlingElement.data), self)
            if status == Browser.REQUEST_STATUS.duplicate:
                logInfo(str(title), self)
        except Exception as e:
            logError("Exception location: browser.html()\n" + str(e), self)
            data = \
            {
                "status": Browser.REQUEST_STATUS.exception,
                "crawlingElement": crawlingElement,
                "lastUrl": None,
                "html": None,
                "title": None,
            }
        # Now, if we have a callback, we have to pass it the data:
        if self.htmlCallback is not None:
            self.htmlCallback(data, self)
        # Or we just return it:
        else:
            return data
        return None
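A hypothetical usage sketch of html() in the no-callback case; the URL and the Browser construction are assumptions, but the dict keys and REQUEST_STATUS values come from the code above:

    # With htmlCallback=None, html() returns the data dict directly:
    data = browser.html("https://example.com")
    if data["status"] == Browser.REQUEST_STATUS.success:
        print(data["title"])
        print(len(data["html"]), "characters fetched from", data["lastUrl"])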
Example #8
 def loadBlank(self):
     try:
         self.driver.get("about:blank")
     except Exception as e:
         logError("Exception location: Browser.loadBlank()\n" + str(e),
                  self)
Example #9
    def get(self, crawlingElement, pipCallback=None):
        """
            This function returns True if the request succeeded.
        """
        # We convert the url:
        crawlingElement = tryUrlToCrawlingElement(crawlingElement)

        # Here, if we have a callback, we have to acquire the cacheLock on the object:
        if self.htmlCallback is not None:
            self.acquire()
        try:
            # And now we can get the html and measure how long it takes:
            tt = TicToc()
            tt.tic(display=False)
            # logInfo("Launching get for: " + str(url))
            if crawlingElement.type == CrawlingElement.TYPE.pipedMessage:
                pipCallback(self,
                            crawlingElement)  # Here the url is a piped message
            else:
                self.stopLoadingAndLoadBlank()
                logInfo("Get starting...", self)
                self.driver.get(crawlingElement.data)
                logInfo("Get DONE", self)
            # logInfo("get done for: " + str(url))

            # global debugCount
            # debugCount += 1

            # We finally got something without exception, so we try to get data:
            (title, html, lastUrl) = self.getDriverData()

            # But here, if the status is not success, we set diffTime as the max:
            diffTime = tt.tic(display=False)
            if self.getStatus(True, crawlingElement, lastUrl, title,
                              html) != Browser.REQUEST_STATUS.success:
                diffTime = self.pageLoadTimeout

            # We add the score to the history:
            self.durationHistory.append(diffTime)

            # And we keep currentUrl and ok status for the "html()" method:
            self.currentCrawlingElement = crawlingElement
            self.lastGetIsOk = True
            self.lastIsDuplicate = False
            if title is not None and html is not None \
            and crawlingElement.type == CrawlingElement.TYPE.uniqueUrl:
                self.lastIsDuplicate = self.duplicates.isDuplicate \
                (
                    crawlingElement.data,
                    title,
                    html,
                    self.maxDuplicatePerDomain
                )

                # Now we add the downloaded page to the duplicates structure:
                Browser.duplicates.add \
                (
                    crawlingElement.data,
                    title,
                    html,
                )

            # The finally block below runs, then we return True (i.e. the request is ok):
            return True
        except Exception as e:
            if not isinstance(e, TimeoutException):
                logError("Exception location: browser.get()\n" + str(e), self)
            # Here we got a timeout, so we set the score to the worst value:
            self.durationHistory.append(self.pageLoadTimeout)

            # And we keep url and ok status for the "html()" method:
            self.currentCrawlingElement = crawlingElement
            self.lastGetIsOk = False

            # The finally block below runs, then we return False (i.e. the request failed):
            return False
        # The finally block is executed before the return statements above
        finally:
            # If the request succeeded:
            if self.lastGetIsOk:
                # First, if this is a duplicate (e.g. a "you've been blocked" page),
                # we don't need to sleep, but we call the callback to keep the crawler aware:
                if self.lastIsDuplicate:
                    theThread = Thread(target=self.noAjaxSleepThenCallback)
                    theThread.start()
                # Then, if we don't have any callback, the caller of this function is the
                # "html()" method of this object, so we just need to sleep:
                elif self.htmlCallback is None:
                    self.doAjaxSleep()
                # Else, if we actually got a proper web page without a timeout,
                # we sleep and we call the callback:
                else:
                    theThread = Thread(target=self.ajaxSleepThenCallback)
                    theThread.start()
            # If we got a timeout, we don't need to sleep:
            else:
                # If there is no callback, we don't do anything:
                if self.htmlCallback is None:
                    pass
                # Else we don't sleep but we call the callback
                # (we could also sleep here, because a timeoutWithContent can still carry content...):
                else:
                    theThread = Thread(target=self.noAjaxSleepThenCallback)
                    theThread.start()
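A hypothetical sketch of the callback flow that the finally block above sets up; the Browser constructor arguments are an assumption (the examples only show that the instance holds an htmlCallback), and the callback signature mirrors the self.htmlCallback(data, self) call in html():

    # Hypothetical callback: it receives the data dict built by html() plus the browser itself:
    def onHtml(data, browser):
        print(data["status"], data["lastUrl"])

    # Assumed constructor; not necessarily the real API signature:
    browser = Browser(htmlCallback=onHtml)
    # get() blocks for the page load and returns True/False; the callback then runs
    # on a separate thread once the ajax sleep (if any) has finished:
    ok = browser.get("https://example.com")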
Example #10
 def tryRelease(self):
     try:
         self.cacheLock.release()
     except Exception as e:
         logError("Exception location: browser.tryRelease()\n" + str(e),
                  self)
Example #11
        pass
    verbose = False
    try:
        verbose = args.verbose
        assert isinstance(verbose, bool)
    except:
        pass
    sleepInterval = 10
    try:
        sleepInterval = args.sleep
        assert sleepInterval > 0
        assert sleepInterval < 10000
    except:
        pass

    logger = Logger("oomwatcher.log")

    while True:
        time.sleep(sleepInterval)
        if getRandomFloat() > 0.95:
            logInfo("getMemoryPercent() = " + str(getMemoryPercent()),
                    logger=logger)
            logInfo("threshold = " + str(threshold), logger=logger)
        if getMemoryPercent() > threshold:
            try:
                sh.bash(scriptPath)  # sh.bash("/home/hayj/taonews-logs/kill-taonews.sh")
                logInfo("Script executed", logger=logger)
            except Exception as e:
                logError(str(e), logger=logger)
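getMemoryPercent() and getRandomFloat() are not defined in this fragment. A minimal sketch of such helpers, assuming psutil is available (the names come from the code above, the implementations are assumptions):

    import random
    import psutil

    def getMemoryPercent():
        # System-wide RAM usage as a percentage (0-100):
        return psutil.virtual_memory().percent

    def getRandomFloat():
        # Uniform float in [0.0, 1.0), used above to log only ~5% of the iterations:
        return random.random()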