async def callback(self, p_message=None):
    ret = None
    proc = None
    Logger.getLogger().info("Request coming: %s", p_message)
    try:
        # Borrow a free driver process from the pool, waiting at most the
        # configured timeout for one to become available.
        proc = await asyncio.wait_for(
            self._driver_process_queue.get(),
            Configure.configure().value(
                "headless.webdriver.freeDriverWaittingTimeout"))
        Logger.getLogger().info("Got a web driver")
        # Hand the request to the driver process, then wait on its
        # output queue for the response.
        proc.getInputQueue().put(p_message, block=False)
        outq = proc.getOutputQueue()
        Logger.getLogger().info("Waiting for response")
        ret = await asyncio.wait_for(outq.get(), timeout=None)
        Logger.getLogger().info("Got response: %s", ret)
        return ret
    except asyncio.TimeoutError:
        Logger.getLogger().error("Can't get free web driver")
        return "None"
    finally:
        # Always return the borrowed process so the pool does not leak.
        if proc is not None:
            self._driver_process_queue.put_nowait(proc)
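# A minimal, self-contained sketch of the checkout/return protocol used by
# callback() above, assuming an asyncio-based pool. FakeDriverProcess and its
# echo behaviour are hypothetical stand-ins for DriverProcess; only the
# borrow -> send -> await-response -> return flow is the point here.
import asyncio

class FakeDriverProcess:
    def __init__(self):
        self._in, self._out = asyncio.Queue(), asyncio.Queue()

    def getInputQueue(self):
        return self._in

    def getOutputQueue(self):
        return self._out

    async def serve(self):
        # Echo one request back as the response.
        msg = await self._in.get()
        await self._out.put({"echo": msg})

async def demo():
    pool = asyncio.Queue()
    proc = FakeDriverProcess()
    await pool.put(proc)
    asyncio.ensure_future(proc.serve())
    borrowed = await asyncio.wait_for(pool.get(), timeout=5)    # borrow
    await borrowed.getInputQueue().put({"addr": "http://example.com"})
    print(await borrowed.getOutputQueue().get())                # response
    pool.put_nowait(borrowed)                                   # return

asyncio.get_event_loop().run_until_complete(demo())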
def getSnapShot(self, p_body):
    try:
        addr = p_body.get("addr")
        if addr is None:
            return "None"
        # Track usage statistics on the pooled driver.
        self._driverwrapper["lastactivetime"] = datetime.datetime.now()
        self._driverwrapper["usetimes"] += 1
        driver = self._driverwrapper["driver"]
        driver.get(addr)
        # Temporarily stretch the window to the full document height so the
        # screenshot captures the whole page, then restore the original size.
        clientHeight = driver.execute_script(
            "return document.body.clientHeight;")
        cursize = driver.get_window_size()
        driver.set_window_size(cursize["width"], clientHeight)
        base64 = driver.get_screenshot_as_base64()
        driver.set_window_size(cursize["width"], cursize["height"])
        return base64
    except queue.Empty:
        Logger.getLogger().error("Driver pool is empty")
        return "None"
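# A hedged sketch of the resize-capture-restore trick used by getSnapShot()
# above, in plain Selenium. The headless Chrome options and the target URL
# are assumptions for illustration, not part of the original:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(chrome_options=options)
try:
    driver.get("http://example.com")
    # Grow the window to the document height so one screenshot covers the
    # whole page, then put the original size back.
    full_height = driver.execute_script("return document.body.clientHeight;")
    size = driver.get_window_size()
    driver.set_window_size(size["width"], full_height)
    png_b64 = driver.get_screenshot_as_base64()   # note: takes no arguments
    driver.set_window_size(size["width"], size["height"])
finally:
    driver.quit()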
def handle(self, data):
    jsonret = json.loads(data.decode())
    Logger.getLogger().info('Register returned from server: %d',
                            jsonret["status"])
    if jsonret["status"] == StatusCode.OK:
        self._isRegistered = True
def __call__(self):
    if not self._isRegistered:
        self._node["event"] = "register"
        self._mp.put(self._node, block=False)
        Logger.getLogger().info('Start register self to server: %s_%s:%d',
                                self._node["id"], self._node["host"],
                                int(self._node["port"]))
def run(self):
    Logger.getLogger().info("Initial web driver")
    for i in range(self._ini_driver_num):
        proc = DriverProcess(p_request_queue=self._request_queue)
        proc.start()
        self._driver_process_queue.put(proc, block=True)
        Logger.getLogger().info("Add one web driver...")
    self.pooledWebDriverManager = PooledWebDriverManager(
        p_queue=self._driver_process_queue,
        p_request_queue=self._request_queue)
    self.pooledWebDriverManager.start()
def getPage(self, p_body):
    try:
        addr = p_body.get("addr")
        if addr is None:
            return "None"
        Logger.getLogger().info("Get page source: %s" % (addr))
        # Track usage statistics on the pooled driver.
        self._driverwrapper["lastactivetime"] = datetime.datetime.now()
        self._driverwrapper["usetimes"] += 1
        driver = self._driverwrapper["driver"]
        # Bound the page load so a slow site cannot hold the driver forever.
        driver.set_page_load_timeout(Configure.configure().value(
            "headless.webdriver.driverGetPageTimeout"))
        driver.get(addr)
        Logger.getLogger().info("Get page source done")
        return driver.page_source
    except queue.Empty:
        Logger.getLogger().error("Driver pool is empty")
        return "None"
    except TimeoutException:
        Logger.getLogger().error("Driver get page timeout")
        return "None"
def run(self):
    executors = {
        'default': apscheduler.executors.pool.ThreadPoolExecutor(2),
        'processpool': apscheduler.executors.pool.ProcessPoolExecutor(2)
    }
    job_defaults = {'coalesce': True, 'max_instances': 1}
    self._scheduler = BackgroundScheduler(executors=executors,
                                          job_defaults=job_defaults,
                                          timezone=utc)
    # Periodically check the pool size (covers both the shrink and grow paths).
    self._scheduler.add_job(self.check, 'interval',
                            seconds=self._monitorMinAvailableNum)
    Logger.getLogger().info("Web driver pool manager starts")
    self._scheduler.start()
def checkLess(self):
    cursize = self._driver_queue.qsize()
    Logger.getLogger().info(
        "*** check minimum driver count, current queue size: %d" % (cursize))
    if cursize <= self._alertMinAvailableNum:
        Logger.getLogger().info(
            "Current queue size is less than alert minimum value: %d <= %d"
            % (cursize, self._alertMinAvailableNum))
        # Top the pool up with a batch of fresh driver processes.
        for i in range(self._iniBrowserNum):
            try:
                proc = DriverProcess()
                self._driver_queue.put(proc, block=False)
                proc.start()
            except queue.Full:
                break
def checkOverload(self):
    cursize = self._driver_queue.qsize()
    Logger.getLogger().info(
        "*** check idle driver count, current queue size: %d" % (cursize))
    if cursize >= self._alertMaxAvailableNum:
        Logger.getLogger().info(
            "Current queue size is greater than alert idle value: %d >= %d"
            % (cursize, self._alertMaxAvailableNum))
        # Retire the surplus drivers beyond the configured ceiling.
        num = cursize - self._alertMaxAvailableNum
        for i in range(num):
            try:
                proc = self._driver_queue.get(block=False)
                proc.raiseExc(SystemExit)
            except queue.Empty:
                pass
def start(self):
    # Tornado generator-coroutine loop: keep trying to register until the
    # server acknowledges, then idle on the same interval.
    Logger.getLogger().info("register worker starts, interval: %s",
                            self._interval)
    while True:
        try:
            if not self._isRegistered:
                self._node["event"] = "register"
                yield self._mp.put(self._node)
                Logger.getLogger().info(
                    'Start register self to server: %s_%s:%d',
                    self._node["id"], self._node["host"],
                    int(self._node["port"]))
        except Exception:
            Logger.getLogger().exception("register attempt failed")
        finally:
            yield tornado.gen.sleep(self._interval)
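# A minimal sketch of how a generator loop like start() above is typically
# driven, assuming tornado's decorator-style coroutines (tornado 4/5).
# heartbeat() is a hypothetical stand-in: decorate the generator and hand it
# to the IOLoop, which schedules it alongside the server's other callbacks.
import tornado.gen
import tornado.ioloop

@tornado.gen.coroutine
def heartbeat(interval):
    # Wake up every `interval` seconds, like the registration loop.
    while True:
        print("re-check registration state")
        yield tornado.gen.sleep(interval)

if __name__ == "__main__":
    tornado.ioloop.IOLoop.current().spawn_callback(heartbeat, 2)
    tornado.ioloop.IOLoop.current().start()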
def run(self):
    while True:
        try:
            Logger.getLogger().info(
                "work node process[%s] waiting for scenario..." % (self.pid))
            # Block until the dispatcher hands this process a job context.
            job_context = self._task_queue.get(block=True)
            job = job_context["job"]
            utc_time = datetime.datetime.utcfromtimestamp(time.time())
            exectime = utc_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            Logger.getLogger().info(
                "work node process[%s] got work, schedule time=>%s, "
                "accept time=>%s, execute time=>%s"
                % (self.pid, job_context["scheduletime"],
                   job_context["accepttime"], exectime))
            self.execute(p_job=job, p_exectime=exectime)
        except Exception:
            traceback.print_exc()
def fuckup(p_command=None):
    start = datetime.datetime.now()
    Main.rootdir = os.path.abspath('.')
    manager = Manager()

    # Initialize application configure
    filename = "application-config.yml"
    Configure.load(p_dir=Main.rootdir + "/" + filename, p_command=p_command)

    # Initialize log
    Logger()
    Logger.getLogger().info("Web Driver Pool Launching......")

    # Initialize driver pool
    driver_queue = queue.Queue(
        Configure.configure().value("headless.webdriver.maxBrowserNum"))
    request_queue = queue.Queue(
        Configure.configure().value("headless.webdriver.maxRequestAcceptNum"))
    Main.webDriverContainer = WebDriverContainer(
        p_queue=driver_queue, p_request_queue=request_queue)
    Main.webDriverContainer.run()

    end = datetime.datetime.now()
    duration = (end - start).seconds
    Logger.getLogger().info(
        "Web Driver Pool Launched after %d seconds" % (duration))

    try:
        # Decode the configured delimiter, e.g. "\x0d\x0a", into raw
        # characters. Note each character is prepended, so the decoded
        # string ends up in reverse of the configured order.
        delimiter = Configure.configure().value(
            "server.webdriverServer.delimiter")
        deary = delimiter.split('\\x')
        destr = ''
        for i in range(len(deary)):
            if deary[i] != '':
                de = chr(int(deary[i], 16))
                destr = de + destr
        StreamHandler.startlisten(p_name="Headless-Webdriver-Server",
                                  p_prefix="server.webdriverServer",
                                  p_queue=request_queue,
                                  p_delimiter=destr)
    except (KeyboardInterrupt, SystemExit):
        pass
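# A worked example of the delimiter decoding above, assuming the configured
# value is the literal text "\x0d\x0a" (CR LF). Splitting on "\x" yields the
# hex byte values, and because each decoded character is prepended, the
# result comes out in reverse of the configured order:
parts = "\\x0d\\x0a".split("\\x")   # -> ['', '0d', '0a']
destr = ""
for part in parts:
    if part != "":
        destr = chr(int(part, 16)) + destr
assert destr == "\n\r"              # reversed: LF then CR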
def fuckup(p_command=None):
    Main.rootdir = os.path.abspath('.')
    manager = Manager()

    # Initialize application configure
    filename = "application-config.yml"
    Configure.load(p_dir=Main.rootdir + "/" + filename, p_command=p_command)

    # Initialize log
    Logger()

    # Initialize elasticsearch client
    ESHandler.ini()

    # Initialize job schedule
    main_jod_queue = ThreadSafeQueue(size=Configure.configure().value(
        "scheduler.messageQueueSize", p_default=1000))
    crawler_picker = CrawlerPicker()
    Main.crawlerRegister = CrawlerRegister(p_crawler_picker=crawler_picker,
                                           p_main_jod_queue=main_jod_queue)
    Main.crawlerRegister.start()

    Main.parellelSchedule = ParellelSchedule(p_main_jod_queue=main_jod_queue)
    Main.parellelSchedule.start()

    # After starting all sub-processes, join to keep shared objects available.
    Main.crawlerRegister.join()

    try:
        # Keep the main thread alive so background threads can run.
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        pass
def fuckup(p_command=None):
    Main.rootdir = os.path.abspath('.')

    # Initialize application configure
    filename = "application-config.yml"
    Configure.load(p_dir=Main.rootdir + "/" + filename, p_command=p_command)
    nodename = Configure.configure().value("worknode.name")

    # Discover this node's outbound IP address via a UDP "connection".
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(('8.8.8.8', 80))
        Main.ipAddr = s.getsockname()[0]
    finally:
        s.close()

    # Initialize log
    Logger()

    # Initialize elasticsearch client
    Main.es_client = ESHandler()

    # Initialize worker monitor
    monitor = MultiProcessJobWatcher()
    executors = {
        'default': ThreadPoolExecutor(1),
        'processpool': ProcessPoolExecutor(1)
    }
    job_defaults = {'coalesce': True, 'max_instances': 1}
    mosche = BackgroundScheduler(executors=executors,
                                 job_defaults=job_defaults,
                                 timezone=utc)
    mosche.add_job(monitor, 'interval',
                   seconds=Configure.configure().value(
                       "worknode.workerMonitorInterval"))
    mosche.start()

    # Initialize worker leader
    leader = Leader(p_addr=Main.ipAddr, p_node_name=nodename,
                    p_monitor=monitor)

    # Initialize node register and health info report schedule
    scheduleserver = {
        "host": Configure.configure().value("server.healthServer.host"),
        "port": Configure.configure().value("server.healthServer.port")
    }
    Main.communicator = Communicator(p_schedule_server=scheduleserver,
                                     p_leader=leader)

    # Initialize node job accept service
    ServerWrapper.listen(p_name=nodename, p_prefix="server.nodeServer",
                         p_handler=leader)
    try:
        # The IOLoop blocks here and keeps the main thread alive.
        tornado.ioloop.IOLoop.current().start()
    except (KeyboardInterrupt, SystemExit):
        # Not strictly necessary if daemonic mode is enabled, but shut the
        # monitor scheduler down cleanly when possible.
        mosche.shutdown()
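# A standalone sketch of the outbound-IP discovery trick used above: a UDP
# socket "connected" to any public address never actually sends a packet,
# but the OS still selects the local interface that would be used, and
# getsockname() reveals that interface's address.
import socket

def local_ip(probe=("8.8.8.8", 80)):
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(probe)          # no traffic is sent for a UDP connect
        return s.getsockname()[0]
    finally:
        s.close()

print(local_ip())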