def stopLoading(self):
    """Abort the page load currently in progress, best-effort.

    Runs ``window.stop()`` in the page, then sends CTRL+Escape to the
    first element on the page as a second stop mechanism. Failures are
    logged, never raised.
    """
    try:
        self.driver.execute_script("window.stop();")
        firstElement = self.driver.find_elements_by_css_selector("*")[0]
        firstElement.send_keys(Keys.CONTROL + 'Escape')
    except Exception as err:
        logError("Exception location: Browser.stopLoading()\n" + str(err), self)
def noAjaxSleepThenCallback(self):
    """Fire the html callback immediately (no ajax sleep), then release the lock."""
    try:
        self.html()
    except Exception as err:
        message = "Exception location: browser.noAjaxSleepThenCallback()\n" + str(err)
        logError(message, self)
    # Whatever happened above, the browser must be unlocked for the next request:
    self.tryRelease()
def ajaxSleepThenCallback(self):
    """Wait for ajax content to settle, fire the html callback, then release the lock."""
    self.doAjaxSleep()
    try:
        self.html()
    except Exception as err:
        message = "Exception location: browser.ajaxSleepThenCallback()\n" + str(err)
        logError(message, self)
    # Here we terminated the sleep and the callback, so we can unlock the object:
    self.tryRelease()
def checkProxy(self):
    """Verify that the configured proxy is actually in use.

    Visits a list of "what is my IP" web sites and compares the IP they
    report against both the configured proxy IP and this machine's real IP.

    Returns:
        bool: True if at least one site reported the proxy's IP,
        False if no proxy is configured or the check failed.

    Side effects: temporarily overrides the page-load timeout and disables
    the ajax sleep for the duration of the check, restoring both afterwards.
    """
    # Guard clause: nothing to check without a proxy.
    if self.proxy is None:
        logError("Proxy not correctly set.", self)
        return False
    webSiteList = \
    [
        "http://fr.geoipview.com",
        # On this web site with a proxy, the page load a lot of "near ip", so it's slow:
        # "http://www.localiser-ip.com",
        "http://www.mon-ip.com",
        "https://www.expressvpn.com/what-is-my-ip",
        "https://www.adresseip.com",
    ]

    def getWebSiteIP(url):
        # Fetch the page and extract the first dotted-quad it contains.
        # Returns None on any failure (timeout, no match, ...).
        try:
            data = self.html(url)["html"]
            # Raw string: "\d" is an invalid escape sequence in a plain str.
            ip = re.search(r"\d+[.]\d+[.]\d+[.]\d+", data).group(0)
            return ip
        except Exception as e:
            # logWarning("Ip not found in " + url + " " + str(e), self)
            return None

    # Use a shorter timeout and no ajax sleep while probing:
    self.driver.set_page_load_timeout(self.checkProxyTimeout)
    previousAjaxSleep = self.ajaxSleep
    self.ajaxSleep = 0.0
    ipWithoutProxy = getIP()
    success = False
    # log("This computer ip is " + ipWithoutProxy, self)
    # Stop at the first site that reports an IP; the check fails if that IP
    # is not the proxy's, or is this machine's real IP:
    for current in webSiteList:
        proxyIP = getWebSiteIP(current)
        if proxyIP is not None:
            if self.proxy["ip"] != proxyIP:
                break
            if proxyIP == ipWithoutProxy:
                break
            success = True
            break
    # Restore the normal crawling settings:
    self.ajaxSleep = previousAjaxSleep
    self.driver.set_page_load_timeout(self.pageLoadTimeout)
    if success:
        log("Successfully init " + self.name + " with proxy " + proxyIP, self)
        return True
    else:
        logWarning(self.name + " failed to use proxy " + self.proxy["ip"], self)
        return False
def initDriver(self, retry=True):
    """Create the PhantomJS driver and apply the page-load timeout.

    On failure, waits 2 seconds and retries exactly once (``retry=False``
    on the recursive call); a second failure is only logged.
    """
    try:
        driver = webdriver.PhantomJS(
            self.phantomjsPath,
            service_args=self.getPhantomJSServiceArgs())
        driver.set_page_load_timeout(self.pageLoadTimeout)
        self.driver = driver
    except Exception as err:
        if not retry:
            logError("This driver can't be init: " + str(err), self)
        else:
            time.sleep(2)
            self.initDriver(retry=False)
def getDriverData(self):
    """Read (title, html, lastUrl) from the underlying driver.

    Values that could not be read (driver raised) are left as None;
    the exception is logged, never propagated.
    """
    currentUrl, pageTitle, pageSource = None, None, None
    try:
        # Read order matters: a failure leaves the later fields as None.
        currentUrl = self.driver.current_url
        pageTitle = self.driver.title.strip()
        pageSource = self.driver.page_source
    except Exception as err:
        logError("Exception location: browser.getDriverData()\n" + str(err), self)
    return (pageTitle, pageSource, currentUrl)
def html(self, crawlingElement=None): """ This function return data. Call "get" method instead if you gave a htmlCallback. """ # We convert the url: crawlingElement = tryUrlToCrawlingElement(crawlingElement) try: # Here it's the user who call this method: if crawlingElement is not None: ok = self.get(crawlingElement) # Here if the htmlCallback is not None, it's the get method which call html(): elif self.htmlCallback is not None: crawlingElement = self.currentCrawlingElement ok = self.lastGetIsOk else: logError( "You can't be in both scenarios described in the doc.", self) exit() # No we try to get some data: (title, html, lastUrl) = self.getDriverData() # And we get the status: status = self.getStatus(ok, crawlingElement, lastUrl, title, html) # Now we got all data, so we can make the data dict: data = \ { "status": status, "crawlingElement": crawlingElement, "lastUrl": lastUrl, "html": html, # theHtml[0:20] "title": title, } # And we log informations: ip = " " if self.proxy is not None: ip = " (" + self.proxy["ip"] + ") " logInfo( str(status.name) + " from " + self.name + ip + str(crawlingElement.data), self) if status == Browser.REQUEST_STATUS.duplicate: logInfo(str(title), self) except Exception as e: logError("Exception location: browser.html()\n" + str(e), self) data = \ { "status": Browser.REQUEST_STATUS.exception, "crawlingElement": crawlingElement, "lastUrl": None, "html": None, "title": None, } # Now if we have a callback, we have to throw the data: if self.htmlCallback is not None: self.htmlCallback(data, self) # Or we just return it: else: return data return None
def loadBlank(self):
    """Navigate the driver to about:blank; failures are logged, not raised."""
    try:
        self.driver.get("about:blank")
    except Exception as err:
        logError("Exception location: Browser.loadBlank()\n" + str(err), self)
def get(self, crawlingElement, pipCallback=None):
    """
    This function return True if the request succeeded.

    Loads the given url / CrawlingElement (or delegates to pipCallback for
    piped messages), records the load duration in durationHistory, and
    stores currentCrawlingElement / lastGetIsOk / lastIsDuplicate for the
    "html()" method. The finally block then either sleeps (ajax settle) or
    spawns a thread that sleeps and fires the htmlCallback.
    """
    # We convert the url:
    crawlingElement = tryUrlToCrawlingElement(crawlingElement)
    # Here we have a callback, we have to cacheLock the object
    # (released later by tryRelease() in the callback threads):
    if self.htmlCallback is not None:
        self.acquire()
    try:
        # And now we can get the html and retain time duration:
        tt = TicToc()
        tt.tic(display=False)
        # logInfo("Launching get for: " + str(url))
        if crawlingElement.type == CrawlingElement.TYPE.pipedMessage:
            pipCallback(self, crawlingElement)  # Here the url is a piped message
        else:
            self.stopLoadingAndLoadBlank()
            logInfo("Get starting...", self)
            self.driver.get(crawlingElement.data)
            logInfo("Get DONE", self)
        # logInfo("get done for: " + str(url))
        # global debugCount
        # debugCount += 1
        # We finally got something without exception, so we try to get data:
        (title, html, lastUrl) = self.getDriverData()
        # But here, if the status is not success, we set diffTime as the max:
        diffTime = tt.tic(display=False)
        if self.getStatus(True, crawlingElement, lastUrl, title, html) \
                != Browser.REQUEST_STATUS.success:
            diffTime = self.pageLoadTimeout
        # We add the score to the history:
        self.durationHistory.append(diffTime)
        # And we keep currentUrl and ok status for the "html()" method:
        self.currentCrawlingElement = crawlingElement
        self.lastGetIsOk = True
        self.lastIsDuplicate = False
        # Duplicate detection only applies to unique-url elements with content:
        if title is not None and html is not None \
        and crawlingElement.type == CrawlingElement.TYPE.uniqueUrl:
            self.lastIsDuplicate = self.duplicates.isDuplicate \
            (
                crawlingElement.data,
                title,
                html,
                self.maxDuplicatePerDomain
            )
            # Now we add the download page to duplicates structure
            # (class-level, shared across Browser instances):
            Browser.duplicates.add \
            (
                crawlingElement.data,
                title,
                html,
            )
        # Finally we exec the finally statement and we return True (i.e. request ok):
        return True
    except Exception as e:
        # Timeouts are expected and not worth an error log:
        if not isinstance(e, TimeoutException):
            logError("Exception location: browser.get()\n" + str(e), self)
        # Here we got a timeout, so we set the score as the badest:
        self.durationHistory.append(self.pageLoadTimeout)
        # And we keep url and ok status for the "html()" method:
        self.currentCrawlingElement = crawlingElement
        self.lastGetIsOk = False
        # Finally we exec the finally statement and we return False (i.e. failed):
        return False
    # The finally is executed before the return statement
    finally:
        # If the request succeeded:
        if self.lastGetIsOk:
            # First if this is a duplicates (i.e. a "you've been blocked" page for instance),
            # we don't need to sleep but we call the callback to keep aware the crawler:
            if self.lastIsDuplicate:
                theThread = Thread(target=self.noAjaxSleepThenCallback)
                theThread.start()
            # Then if we don't have any callback, the caller of this funct is the
            # "html()" method of this object, so we just need to sleep:
            elif self.htmlCallback is None:
                self.doAjaxSleep()
            # Else if we actually have a right web page without timeout
            # We sleep and we call the callback:
            else:
                theThread = Thread(target=self.ajaxSleepThenCallback)
                theThread.start()
        # If we got a timeout, we don't need to sleep:
        else:
            # If there are no callback, we don't do anything:
            if self.htmlCallback is None:
                pass
            # Else we don't sleep but call the callback:
            # Or we have to sleep because we can have a timeoutWithContent...
            else:
                theThread = Thread(target=self.noAjaxSleepThenCallback)
                theThread.start()
def tryRelease(self):
    """Release the cache lock; an already-released lock is logged, not raised."""
    try:
        self.cacheLock.release()
    except Exception as err:
        logError("Exception location: browser.tryRelease()\n" + str(err), self)
pass
# --- OOM watcher main loop ---
# Reads optional overrides from `args` (best-effort: any failure keeps the
# default), then polls memory usage forever and runs `scriptPath` whenever
# usage exceeds `threshold`.
verbose = False
try:
    verbose = args.verbose
    assert isinstance(verbose, bool)
except Exception:
    pass
# Seconds between two memory checks (bounded to a sane range):
sleepInterval = 10
try:
    sleepInterval = args.sleep
    assert sleepInterval > 0
    assert sleepInterval < 10000
except Exception:
    pass
logger = Logger("oomwatcher.log")
while True:
    # Fix: honor the configured interval (was a hard-coded 10.0, which made
    # the args.sleep parsing above dead code):
    time.sleep(sleepInterval)
    # Occasionally (~5% of iterations) log the current state:
    if getRandomFloat() > 0.95:
        logInfo("getMemoryPercent() = " + str(getMemoryPercent()), logger=logger)
        logInfo("threshold = " + str(threshold), logger=logger)
    if getMemoryPercent() > threshold:
        try:
            sh.bash(scriptPath)
            # sh.bash("/home/hayj/taonews-logs/kill-taonews.sh")
            logInfo("Script executed", logger=logger)
        except Exception as e:
            logError(str(e), logger=logger)