def __init__(self):
    """Build an elasticsearch HTTP connection pool from configuration.

    Reads `elasticsearch.url` and `elasticsearch.pool.maxsize`, extracts the
    host[:port] portion of the URL, and opens a urllib3 connection pool to it.
    """
    esurl = Configure.configure().value("elasticsearch.url")
    poolsize = Configure.configure().value("elasticsearch.pool.maxsize")
    self._es_url = esurl
    self._pool_maxsize = poolsize
    # Strip the scheme prefix if present.
    # BUG FIX: original left _es_domain as None (then crashed on .find) when
    # the URL had no http/https scheme.
    if esurl.startswith("http://"):
        domain = esurl[len("http://"):]
    elif esurl.startswith("https://"):
        domain = esurl[len("https://"):]
    else:
        domain = esurl  # no scheme configured: treat the whole value as host part
    # Keep only host[:port].
    # BUG FIX: original did domain[:idx] unconditionally, so idx == -1 (no "/"
    # in the URL) silently dropped the last character of the host.
    idx = domain.find("/")
    if idx >= 0:
        domain = domain[:idx]
    self._es_domain = domain
    print("extract domain", self._es_domain)
    # NOTE(review): https URLs are still served over HTTPConnectionPool
    # (plain HTTP), same as the original — confirm whether HTTPSConnectionPool
    # is wanted for https schemes.
    self._es_client = urllib3.HTTPConnectionPool(
        self._es_domain, maxsize=self._pool_maxsize)
def __init__(self, p_request_queue, p_alive_timeout=None):
    """Worker thread owning a single headless Chrome instance.

    p_request_queue -- shared queue this worker pulls requests from
    p_alive_timeout -- optional lifetime limit for this driver process
    """
    threading.Thread.__init__(self)
    # window geometry and chromedriver location come from configuration
    self._iniWinHeight = Configure.configure().value(
        "headless.webdriver.iniBrowserWinHeight")
    self._iniWinWidth = Configure.configure().value(
        "headless.webdriver.iniBrowserWinWidth")
    self._driver_path = Configure.configure().value(
        "headless.webdriver.path")
    # single-slot in/out queues: at most one request in flight per worker
    self._input = queue.Queue(1)
    self._output = asyncio.Queue(maxsize=1)
    self._alive_timeout = p_alive_timeout
    self._request_queue = p_request_queue
    # event-name -> handler dispatch table
    self._events = {"getPage": self.getPage, "snapshot": self.getSnapShot}
    # launch a headless Chrome sized to the configured window
    launch_options = Options()
    launch_options.add_argument("--headless")
    launch_options.add_argument(
        "--window-size=" + str(self._iniWinWidth) + "x" + str(self._iniWinHeight))
    driver = webdriver.Chrome(chrome_options=launch_options,
                              executable_path=self._driver_path)
    self._driverwrapper = {
        "driver": driver,
        "instancetime": datetime.datetime.now(),
        "lastactivetime": datetime.datetime.now(),
        "usetimes": 0,
    }
def execute(self, p_job, p_exectime):
    """Derive the per-execution working directories for a scenario job.

    Directory layout: <rootdir>/s_<scenarioId>_<exectime> with a /tmp subdir.
    """
    scenario = p_job["scenario"]
    sid = scenario["scenarioId"]
    outputdir = Configure.configure().value("application.outputdir")
    rootdir = Configure.configure().value("application.rootdir")
    workdir = rootdir + "/s_" + sid + "_" + p_exectime
    tmpdir = workdir + "/tmp"
    print("task dir: %s, %s" % (workdir, tmpdir))
def fuckup(p_command=None):
    """Entry point for the scheduler/master service.

    Loads configuration and logging, initializes elasticsearch, builds the
    shared job queue, starts the crawler register and parallel schedule
    threads, then keeps the main thread alive until interrupted.
    """
    Main.rootdir = os.path.abspath('.')
    # NOTE(review): `manager` is never used below — confirm whether the
    # Manager process is relied on for side effects before removing.
    manager = Manager()
    #BaseManager.register('CrawlerPicker', CrawlerPicker)
    #manager = BaseManager()
    #Initialize application configure
    filename = "application-config.yml"
    Configure.load(p_dir=Main.rootdir+"/"+filename, p_command=p_command)
    #Initialize log
    Logger()
    #Initialize elasticsearch client
    ESHandler.ini()
    #Initialize job schedule
    #main_jod_queue = queue.Queue(Configure.configure().value("scheduler.messageQueueSize", p_default=1000))
    main_jod_queue = ThreadSafeQueue(size=Configure.configure().value("scheduler.messageQueueSize", p_default=1000))
    crawler_picker = CrawlerPicker()
    Main.crawlerRegister = CrawlerRegister(p_crawler_picker=crawler_picker, p_main_jod_queue=main_jod_queue)
    Main.crawlerRegister.start()
    #main_jod_queue = manager.Queue(Configure.configure().value("scheduler.messageQueueSize", p_default=1000))
    #main_jod_queue = Queue(maxsize=Configure.configure().value("scheduler.messageQueueSize", p_default=1000))
    Main.parellelSchedule=ParellelSchedule(p_main_jod_queue=main_jod_queue)
    Main.parellelSchedule.start()
    #Main.parellelSchedule.run()
    #Main.crawlerRegister.daemon = True
    #Main.crawlerRegister.run()
    #registerserver = Configure.configure().value("server.crawler.healthServer.host")
    #registerport = Configure.configure().value("server.crawler.healthServer.port")
    #Main.jobSync = JobSync(p_queue=main_jod_queue, p_register={"host":registerserver, "port":registerport}, p_crawler_picker=crawler_picker)
    #Main.jobSync.start()
    #Start main thread loop
    #tornado.ioloop.IOLoop.current().start()
    #After start all sub process, we need invode join function to make shared object available
    #Main.jobSync.join()
    # Block until the register thread finishes so shared objects stay valid.
    Main.crawlerRegister.join()
    #Initialize server
    try:
        # This is here to simulate application activity (which keeps the main thread alive).
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        pass
def start(self):
    """Accept TCP connections forever, enqueueing each for a worker.

    Blocks in an accept loop; every accepted connection is wrapped with its
    address and the configured delimiter and pushed onto the shared queue,
    waiting up to the configured timeout when the queue is full.
    """
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0)
    listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    # tune kernel send/receive buffers before binding
    listener.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, self._send_buffer_size)
    listener.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, self._recv_buffer_size)
    listener.bind((self._host, self._port))
    listener.listen(128)
    while True:
        conn, addr = listener.accept()
        print("Get connection accept")
        self._queue.put(
            {"conn": conn, "addr": addr, "delimiter": self._delimiter},
            block=True,
            timeout=Configure.configure().value(
                "headless.webdriver.requestWaittingTimeout"))
        print("Put connection to queue")
def __init__(self, p_name, p_host, p_port, p_sendBufferSize, p_recvBufferSize,
             p_queue, p_max_buffer_size=None, p_read_chunk_size=None,
             p_delimiter="\n"):
    """TCP stream server that hands accepted connections to a worker queue."""
    TCPServer.__init__(self, max_buffer_size=p_max_buffer_size,
                       read_chunk_size=p_read_chunk_size)
    self._name = p_name
    self._host = p_host
    self._port = p_port
    self._send_buffer_size = p_sendBufferSize
    self._recv_buffer_size = p_recvBufferSize
    self._delimiter = p_delimiter
    self._queue = p_queue
    # how long to wait when the request queue is full
    self._request_waiting_timeout = Configure.configure().value(
        p_key="headless.webdriver.requestWaittingTimeout")
def getPage(self, p_body):
    """Load the URL in p_body["addr"] with this worker's driver and return its page source.

    Returns the literal string "None" as a failure sentinel when no address
    is supplied, the driver pool is empty, or the page load times out —
    callers depend on that sentinel, so it is preserved.
    """
    try:
        # .get() replaces the membership-test ternaries; the old unused
        # `flag` local is dropped.
        addr = p_body.get("addr")
        if addr is None:  # was `== None`; identity is the correct None check
            return "None"
        Logger.getLogger().info("Get page source: %s" % (addr))
        # bookkeeping on the single shared driver wrapper
        self._driverwrapper["lastactivetime"] = datetime.datetime.now()
        self._driverwrapper["usetimes"] += 1
        driver = self._driverwrapper["driver"]
        driver.set_page_load_timeout(Configure.configure().value(
            "headless.webdriver.driverGetPageTimeout"))
        driver.get(addr)
        Logger.getLogger().info("Get page source done")
        return driver.page_source
    except queue.Empty:
        Logger.getLogger().error("Driver pool is empty")
        return "None"
    except TimeoutException:
        Logger.getLogger().error("Driver get page timeout")
        return "None"
async def callback(self, p_message=None):
    """Dispatch one request to a pooled web-driver process and await its reply.

    Borrows a process from the pool (bounded wait), forwards *p_message* to
    its input queue, awaits the answer on its output queue, and always
    returns the process to the pool in `finally`. Returns the string
    sentinel "None" when no free driver becomes available in time.
    """
    ret = None
    proc = None
    print("Request coming: ", p_message)
    try:
        # wait up to the configured timeout for a free driver process;
        # asyncio.TimeoutError below maps to the "no driver" sentinel
        proc = await asyncio.wait_for(
            self._driver_process_queue.get(),
            Configure.configure().value(
                "headless.webdriver.freeDriverWaittingTimeout"))
        Logger.getLogger().info("Got a web driver")
        proc.getInputQueue().put(p_message, block=False)
        print("put message: ", p_message)
        #await proc.execute(p_message)
        outq = proc.getOutputQueue()
        print("Waiting for response: ")
        # timeout=None: wait indefinitely for the driver's response
        ret = await asyncio.wait_for(outq.get(), timeout=None)
        print("Got response: ", ret)
        return ret
    except asyncio.TimeoutError:
        Logger.getLogger().error("Can't get free web driver")
        return "None"
    finally:
        # return the borrowed process to the pool on every exit path
        if proc != None:
            self._driver_process_queue.put_nowait(proc)
def __init__(self, p_queue, p_request_queue):
    """Container thread that manages the pool of driver processes."""
    threading.Thread.__init__(self)
    # number of browser processes to spin up initially, from configuration
    self._ini_driver_num = Configure.configure().value(
        "headless.webdriver.iniBrowserNum")
    self._driver_process_queue = p_queue
    self._request_queue = p_request_queue
def startlisten(p_name, p_prefix, p_queue, p_delimiter):
    """Construct a StreamHandler from the config keys under *p_prefix* and run it."""
    cfg = Configure.configure()
    # read the server parameters in the same order as before
    port = cfg.value(p_key=p_prefix + ".port")
    host = cfg.value(p_key=p_prefix + ".host")
    send_buf = cfg.value(p_key=p_prefix + ".sendBufferSize")
    recv_buf = cfg.value(p_key=p_prefix + ".recvBufferSize")
    server = StreamHandler(p_name=p_name,
                           p_host=host,
                           p_port=port,
                           p_sendBufferSize=send_buf,
                           p_recvBufferSize=recv_buf,
                           p_queue=p_queue,
                           p_delimiter=p_delimiter)
    print("Server[" + p_name + "] starts at " + str(port) + "...")
    server.start()
def __init__(self):
    """Configure the root logger with a timed rotating file handler.

    Reads directory, level, and rotation policy from configuration, installs
    a TimedRotatingFileHandler on the root logger, and quiets apscheduler.
    """
    self._level = {"INFO": logging.INFO, "WARNING": logging.WARNING,
                   "DEBUG": logging.DEBUG, "ERROR": logging.ERROR}
    logdir = Configure.configure().value(p_key="logger.dir")
    loglevel = Configure.configure().value(p_key="logger.level")
    logunit = Configure.configure().value(p_key="logger.keepUnit")
    loginterval = Configure.configure().value(p_key="logger.keepInterval")
    logcount = Configure.configure().value(p_key="logger.keepCount")
    # BUG FIX: original format had no separator between the quoted file name
    # and the level name ('..."%(filename)s"%(levelname)s...'), producing
    # fused output like `File "x.py"INFO:`; a tab is inserted to match the
    # tab used after %(asctime)s.
    log_fmt = '%(asctime)s\tFile \"%(filename)s\"\t%(levelname)s: %(message)s'
    formatter = logging.Formatter(log_fmt)
    log_file_handler = TimedRotatingFileHandler(filename=logdir,
                                                when=logunit,
                                                interval=loginterval,
                                                backupCount=logcount)
    log_file_handler.suffix = "%Y-%m-%d"
    log_file_handler.setFormatter(formatter)
    logging.basicConfig(level=self._level[loglevel.upper()])
    Logger.logger = logging.getLogger()
    Logger.logger.addHandler(log_file_handler)
    # apscheduler is chatty at INFO; keep only its errors
    logging.getLogger('apscheduler').setLevel(logging.ERROR)
def __init__(self, p_schedule_server, p_leader):
    """Node-side communicator: registers this work node with the scheduler periodically."""
    self._schedule_server = p_schedule_server
    self._leader = p_leader
    self._register_instance = None
    # build the periodic registration task from the configured interval
    interval = Configure.configure().value(p_key="worknode.registerInterval")
    self._register_task = self.generateRegisterTask(p_interval=interval)
def __init__(self, p_queue, p_request_queue):
    """Monitor thread for the headless-browser pool: idle time and pool-size thresholds."""
    threading.Thread.__init__(self)

    def _cfg(key):
        # one configuration read per call, same as the original's per-line reads
        return Configure.configure().value("headless.webdriver." + key)

    # polling interval and maximum allowed browser idle time
    self._interval = _cfg("browserIdleTimeMonitorInterval")
    self._max_idle_time = _cfg("maxBrowserIdleTime")
    # alert thresholds on available browser count
    self._alertMinAvailableNum = _cfg("alertMinAvailableNum")
    self._alertMaxAvailableNum = _cfg("alertMaxAvailableNum")
    # monitoring thresholds on available browser count
    self._monitorMinAvailableNum = _cfg("monitorMinAvailableNum")
    self._monitorMaxAvailableNum = _cfg("monitorMaxAvailableNum")
    # initial pool sizing and browser window geometry
    self._iniBrowserNum = _cfg("iniBrowserNum")
    self._iniWinHeight = _cfg("iniBrowserWinHeight")
    self._iniWinWidth = _cfg("iniBrowserWinWidth")
    self._driver_path = _cfg("path")
    self._alert_used_rate = 0.5
    self._driver_queue = p_queue
    self._request_queue = p_request_queue
def __init__(self, p_addr, p_node_name, p_monitor):
    """Leader: owns the node's shared work queue and spawns the Mate worker pool."""
    self._node_id = p_node_name
    self._host = p_addr
    self._port = Configure.configure().value("server.nodeServer.port")
    self._max_working_proc_num = Configure.configure().value(
        "worknode.maxWorkerNum")
    # cross-process work queue shared with the Mate workers
    self._work_queue = Manager().Queue(
        Configure.configure().value("worknode.mainWorkQueueSize"))
    self._monitor = p_monitor
    self._events = {"work": self.accept}
    # spawn the fixed-size worker pool; each started worker is registered
    # with the monitor
    for _ in range(self._max_working_proc_num):
        worker = Mate(p_leader=self, p_queue=self._work_queue)
        worker.start()
        self._monitor.newProc(p_proc=worker)
def listen(p_name, p_prefix, p_handler):
    """Start a SimpleTcpServer for *p_handler* on the port configured under *p_prefix*."""
    port = Configure.configure().value(p_key=p_prefix + ".port")
    tcp_server = SimpleTcpServer(p_name=p_name, p_callback=p_handler)
    tcp_server.listen(port)
    tcp_server.start()
    print("Server[" + p_name + "] starts at " + str(port) + "...")
def fuckup(p_command=None):
    """Entry point for the web-driver pool service.

    Loads configuration and logging, builds the driver and request queues,
    starts the driver container, then listens for incoming requests.
    """
    start = datetime.datetime.now()
    Main.rootdir = os.path.abspath('.')
    # NOTE(review): manager is never used below — confirm whether the Manager
    # process is relied on for side effects before removing.
    manager = Manager()
    # Initialize application configure
    filename = "application-config.yml"
    Configure.load(p_dir=Main.rootdir + "/" + filename, p_command=p_command)
    # Initialize log
    Logger()
    Logger.getLogger().info("Web Driver Pool Launching......")
    # Initialize driver pool
    driver_queue = queue.Queue(
        Configure.configure().value("headless.webdriver.maxBrowserNum"))
    request_queue = queue.Queue(
        Configure.configure().value("headless.webdriver.maxRequestAcceptNum"))
    Main.webDriverContainer = WebDriverContainer(p_queue=driver_queue,
                                                 p_request_queue=request_queue)
    Main.webDriverContainer.run()
    end = datetime.datetime.now()
    # BUG FIX: original computed (start - end).seconds on a negative timedelta,
    # which reports a bogus near-day value; the elapsed time is end - start.
    duration = (end - start).seconds
    Logger.getLogger().info("Web Driver Pool Launched after %d seconds" % (duration))
    try:
        delimiter = Configure.configure().value("server.webdriverServer.delimiter")
        # The configured delimiter is written as escaped hex bytes, e.g.
        # "\x0d\x0a"; decode each byte. NOTE(review): bytes are *prepended*,
        # so the decoded string is reversed relative to config order — this
        # matches the original behavior; confirm it is intended.
        destr = ''
        for part in delimiter.split('\\x'):
            if part != '':
                destr = chr(int(part, 16)) + destr
        StreamHandler.startlisten(p_name="Headless-Webdriver-Server",
                                  p_prefix="server.webdriverServer",
                                  p_queue=request_queue,
                                  p_delimiter=destr)
    except (KeyboardInterrupt, SystemExit):
        pass
def check(self):
    """Top up the driver pool when pending requests reach half the pool size.

    Compares the waiting-request count with the available-driver count and,
    when the ratio reaches 0.5 (or the pool is empty), launches up to
    `_iniBrowserNum` additional driver processes with staggered timeouts.
    Failures to add a driver are reported but non-fatal (best effort).
    """
    drivernum = self._driver_queue.qsize()
    requestnum = self._request_queue.qsize()
    print("current driver num is %d, waiting request num is %d" % (drivernum, requestnum))
    # BUG FIX: original divided by drivernum unconditionally, raising
    # ZeroDivisionError whenever the pool was empty; an empty pool is
    # treated as needing new drivers.
    if drivernum == 0 or round(requestnum / drivernum, 4) >= 0.5:
        print("waiting request num is half of driver num")
        for i in range(self._iniBrowserNum):
            try:
                # stagger each new process's alive timeout by its index
                timeout = int(Configure.configure().value(
                    "headless.webdriver.addedNewDriverProcessAliveTimeout")) + i
                proc = DriverProcess(p_request_queue=self._request_queue,
                                     p_alive_timeout=timeout)
                self._driver_queue.put(proc, block=False)
                proc.start()
            except Exception as err:
                # was a bare `except: pass`, which also swallowed
                # KeyboardInterrupt; keep best-effort semantics but report
                print("failed to add driver process:", err)
def fuckup(p_command=None):
    """Entry point for a work node.

    Loads configuration, discovers the local IP, wires up the worker monitor,
    leader, and communicator, then serves jobs via tornado's IO loop until
    interrupted.
    """
    Main.rootdir = os.path.abspath('.')
    # Initialize application configure
    filename = "application-config.yml"
    Configure.load(p_dir=Main.rootdir + "/" + filename, p_command=p_command)
    nodename = Configure.configure().value("worknode.name")
    # Discover this host's outbound IP by UDP-connecting toward a public
    # address (no packets are actually sent by a UDP connect).
    # BUG FIX: socket creation moved out of the try block — the original's
    # `finally: s.close()` raised NameError if socket() itself failed.
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(('8.8.8.8', 80))
        Main.ipAddr = s.getsockname()[0]
    finally:
        s.close()
    # Initialize log
    Logger()
    # Initialize elasticsearch client
    Main.es_client = ESHandler()
    # Initialize worker monitor
    monitor = MultiProcessJobWatcher()
    executors = {
        'default': ThreadPoolExecutor(1),
        'processpool': ProcessPoolExecutor(1)
    }
    job_defaults = {'coalesce': True, 'max_instances': 1}
    mosche = BackgroundScheduler(executors=executors,
                                 job_defaults=job_defaults,
                                 timezone=utc)
    # NOTE(review): mosche is never .start()ed, so this monitoring job never
    # runs — preserved as-is; confirm whether starting it is intended.
    mosche.add_job(monitor, 'interval',
                   seconds=Configure.configure().value(
                       "worknode.workerMonitorInterval"))
    # Initialize worker leader
    leader = Leader(p_addr=Main.ipAddr, p_node_name=nodename, p_monitor=monitor)
    # Initialize node register and health info report schedule
    scheduleserver = {
        "host": Configure.configure().value("server.healthServer.host"),
        "port": Configure.configure().value("server.healthServer.port")
    }
    Main.communicator = Communicator(p_schedule_server=scheduleserver,
                                     p_leader=leader)
    # Initialize node job accept service
    ServerWrapper.listen(p_name=nodename, p_prefix="server.nodeServer",
                         p_handler=leader)
    try:
        # Blocks here serving requests until interrupted.
        # BUG FIX: the original followed this blocking call with an
        # unreachable sleep loop, and its shutdown handler referenced an
        # undefined name `parellelSchedule` (guaranteed NameError).
        tornado.ioloop.IOLoop.current().start()
    except (KeyboardInterrupt, SystemExit):
        pass
def __init__(self):
    """Watcher over worker processes; tracks each worker's allowed working time."""
    # NOTE(review): the value is read from "worknode.maxWorkerNum" but stored
    # as a max *working time* — key and attribute name disagree; confirm the
    # intended configuration key.
    self._max_working_time_per_worker = Configure.configure().value(
        "worknode.maxWorkerNum")
    # pid/name -> process handle registry, filled by newProc()
    self._proc_pool = {}