Esempio n. 1
0
    async def callback(self, p_message=None):
        """Dispatch *p_message* to a pooled web-driver process and await its reply.

        Waits (with a configured timeout) for a free driver process from
        ``self._driver_process_queue``, forwards the message to that process's
        input queue, then blocks on its output queue for the response.

        Returns the driver's response, or the string ``"None"`` when no free
        driver became available before the timeout.
        """
        ret = None
        proc = None
        print("Request coming: ", p_message)
        try:
            # Only acquiring a free driver is bounded by the configured timeout;
            # waiting for the response below is unbounded.
            proc = await asyncio.wait_for(
                self._driver_process_queue.get(),
                Configure.configure().value(
                    "headless.webdriver.freeDriverWaittingTimeout"))
            Logger.getLogger().info("Got a web driver")
            proc.getInputQueue().put(p_message, block=False)
            print("put message: ", p_message)
            outq = proc.getOutputQueue()
            print("Waiting for response: ")
            # wait_for(..., timeout=None) adds no timeout — await directly.
            ret = await outq.get()
            print("Got response: ", ret)
            return ret

        except asyncio.TimeoutError:
            Logger.getLogger().error("Can't get free web driver")
            return "None"
        finally:
            # Always return the driver process to the pool, even on error paths.
            if proc is not None:
                self._driver_process_queue.put_nowait(proc)
Esempio n. 2
0
    def getSnapShot(self, p_body):
        """Take a full-page screenshot of the URL in ``p_body["addr"]``.

        Grows the browser window to the document's full client height before
        capturing, then restores the previous window size.

        Returns the screenshot as a base64 string, or the string ``"None"``
        when no address was supplied or no driver is available.
        """
        try:
            addr = p_body.get("addr")
            if addr is None:
                return "None"

            # Track usage statistics on the shared driver wrapper.
            self._driverwrapper["lastactivetime"] = datetime.datetime.now()
            self._driverwrapper["usetimes"] += 1

            driver = self._driverwrapper["driver"]

            driver.get(addr)
            # Resize the window to the document height so the shot covers the
            # whole page, not just the viewport.
            clientHeight = driver.execute_script(
                "return document.body.clientHeight;")
            cursize = driver.get_window_size()
            driver.set_window_size(cursize["width"], clientHeight)

            # BUG FIX: Selenium's get_screenshot_as_base64() takes no
            # arguments — passing the address raised a TypeError.
            base64 = driver.get_screenshot_as_base64()
            driver.set_window_size(cursize["width"], cursize["height"])

            return base64
        except queue.Empty:
            Logger.getLogger().error("Driver pool is empty")
            return "None"
Esempio n. 3
0
    def handle(self, data):
        """Handle the register response sent back by the server.

        ``data`` is a JSON-encoded bytes payload containing a ``status``
        field. Marks this node as registered when the server acknowledged
        with ``StatusCode.OK``.
        """
        jsonret = json.loads(data.decode())
        Logger.getLogger().info('Register returned from server: %d',
                                jsonret["status"])

        if jsonret["status"] == StatusCode.OK:
            self._isRegistered = True
Esempio n. 4
0
    def __call__(self):
        """Periodic job body: (re-)enqueue a register event for this node.

        Does nothing once registration has been acknowledged
        (``self._isRegistered`` set by the response handler).
        """
        if not self._isRegistered:
            self._node["event"] = "register"
            self._mp.put(self._node, block=False)
            print("register self to server", self._node)
            Logger.getLogger().info('Start register self to server: %s_%s:%d',
                                    self._node["id"], self._node["host"],
                                    int(self._node["port"]))
Esempio n. 5
0
    def run(self):
        """Spawn the initial pool of driver processes, then start the pool manager."""
        Logger.getLogger().info("Initial web driver")
        for _ in range(self._ini_driver_num):
            worker = DriverProcess(p_request_queue=self._request_queue)
            worker.start()
            self._driver_process_queue.put(worker, block=True)
            Logger.getLogger().info("Add one web driver...")

        manager = PooledWebDriverManager(
            p_queue=self._driver_process_queue,
            p_request_queue=self._request_queue)
        self.pooledWebDriverManager = manager
        manager.start()
Esempio n. 6
0
    def getPage(self, p_body):
        """Fetch the page source of the URL in ``p_body["addr"]``.

        Applies the configured page-load timeout to the pooled driver before
        navigating. Returns the page HTML, or the string ``"None"`` when no
        address was supplied, no driver is available, or the load times out.
        """
        try:
            addr = p_body.get("addr")
            if addr is None:
                return "None"
            # Lazy %-args instead of eager string formatting in the log call.
            Logger.getLogger().info("Get page source: %s", addr)

            # Track usage statistics on the shared driver wrapper.
            self._driverwrapper["lastactivetime"] = datetime.datetime.now()
            self._driverwrapper["usetimes"] += 1

            driver = self._driverwrapper["driver"]
            driver.set_page_load_timeout(Configure.configure().value(
                "headless.webdriver.driverGetPageTimeout"))
            driver.get(addr)
            Logger.getLogger().info("Get page source done")
            return driver.page_source
        except queue.Empty:
            Logger.getLogger().error("Driver pool is empty")
            return "None"
        except TimeoutException:
            Logger.getLogger().error("Driver get page timeout")
            return "None"
Esempio n. 7
0
 def run(self):
     """Build the background scheduler and start the periodic pool check job."""
     pool_executors = {
         'default': apscheduler.executors.pool.ThreadPoolExecutor(2),
         'processpool': apscheduler.executors.pool.ProcessPoolExecutor(2),
     }
     defaults = {'coalesce': True, 'max_instances': 1}
     self._scheduler = BackgroundScheduler(executors=pool_executors,
                                           job_defaults=defaults,
                                           timezone=utc)
     self._scheduler.add_job(self.check, 'interval',
                             seconds=self._monitorMinAvailableNum)
     Logger.getLogger().info("Web driver pool manager starts")
     self._scheduler.start()
Esempio n. 8
0
  def checkLess(self):
    """Top up the driver pool when the number of free drivers is low.

    When the queue size falls to ``_alertMinAvailableNum`` or below, spawn
    ``_iniBrowserNum`` additional driver processes.
    """
    cursize = self._driver_queue.qsize()
    Logger.getLogger().info("*** check minimum driver count, current queue size: %d" % (cursize))
    if cursize <= self._alertMinAvailableNum:
      Logger.getLogger().info("Current queue size is less than alert minimum value: %d <= %d" % (cursize, self._alertMinAvailableNum))
      for i in range(self._iniBrowserNum):
        try:
          proc = DriverProcess()
          # Start the process before exposing it in the pool so a consumer
          # can never receive an unstarted worker.
          proc.start()
          self._driver_queue.put(proc, block=False)
        except Exception:
          # Best effort: a full queue (queue.Full) or a failed spawn just
          # skips this slot; the next periodic check will retry.
          Logger.getLogger().error("Failed to add a driver process")
Esempio n. 9
0
  def checkOverload(self):
    """Shrink the driver pool when too many drivers sit idle.

    When the queue size reaches ``_alertMaxAvailableNum``, terminate the
    surplus driver processes by raising SystemExit inside them.
    """
    cursize = self._driver_queue.qsize()
    Logger.getLogger().info("*** check idle driver count, current queue size: %d" % (cursize))
    if cursize >= self._alertMaxAvailableNum:
      Logger.getLogger().info("Current queue size is great than alert idle value: %d >= %d" % (cursize, self._alertMaxAvailableNum))
      # BUG FIX: the surplus is cursize - alertMax. The original computed
      # alertMax - cursize, which is <= 0 inside this branch, so the loop
      # below never ran and the pool never shrank.
      num = cursize - self._alertMaxAvailableNum
      for i in range(num):
        try:
          proc = self._driver_queue.get(block=False)
          proc.raiseExc(SystemExit)
        except Exception:
          # Best effort: an empty queue or an already-dead process is ignored.
          pass
Esempio n. 10
0
    def start(self):
        """Generator loop (tornado.gen style): keep registering this node.

        Each iteration enqueues a "register" event while registration has not
        been acknowledged, then yields a ``tornado.gen.sleep`` so the loop
        ticks every ``self._interval`` seconds.
        """
        print("register worker starts...", self._interval)
        while True:
            try:
                if not self._isRegistered:
                    self._node["event"] = "register"
                    yield self._mp.put(self._node)
                    print("register self to server", self._node)
                    Logger.getLogger().info(
                        'Start register self to server: %s_%s:%d',
                        self._node["id"], self._node["host"],
                        int(self._node["port"]))
            except Exception:
                # Best effort: registration is retried on the next tick, so a
                # transient failure is deliberately swallowed. Narrowed from a
                # bare except so SystemExit/KeyboardInterrupt can escape.
                pass
            finally:
                yield tornado.gen.sleep(self._interval)
Esempio n. 11
0
    def run(self):
        """Worker-process main loop: pull job contexts from the task queue and run them.

        Runs forever; a failure in any single job is logged with a traceback
        and the loop continues with the next job.
        """
        while True:
            try:
                print("work node process[%s] waiting for scenario..." %
                      (self.pid))
                job_context = self._task_queue.get(block=True)
                # BUG FIX: the original passed the "%s" format string straight
                # to print(), so the pid was printed literally as "%s".
                print("work node process[%s] got message: " % (self.pid),
                      job_context)
                job = job_context["job"]

                utc_time = datetime.datetime.utcfromtimestamp(time.time())
                exectime = utc_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
                Logger.getLogger().info(
                    "work node process[%s] got work, schedule time=>%s, accept time =>%s, execute time => %s"
                    % (self.pid, job_context["scheduletime"],
                       job_context["accepttime"], exectime))

                self.execute(p_job=job, p_exectime=exectime)
            except Exception:
                # Keep the worker alive on job errors; narrowed from a bare
                # except so SystemExit/KeyboardInterrupt can stop the process.
                traceback.print_exc()
Esempio n. 12
0
  def fuckup(p_command=None):
    """Entry point: bootstrap configuration, logging, the driver pool and the server.

    p_command: optional command-line overrides forwarded to Configure.load().
    """
    start = datetime.datetime.now()
    Main.rootdir = os.path.abspath('.')
    # NOTE(review): `manager` is unused, but Manager() spawns a server process,
    # so it is kept to preserve behavior — confirm whether it can be removed.
    manager = Manager()

    # Initialize application configuration
    filename = "application-config.yml"
    Configure.load(p_dir=Main.rootdir + "/" + filename, p_command=p_command)

    # Initialize log
    Logger()
    Logger.getLogger().info("Web Driver Pool Launching......")

    # Initialize driver pool
    driver_queue = queue.Queue(Configure.configure().value("headless.webdriver.maxBrowserNum"))
    request_queue = queue.Queue(Configure.configure().value("headless.webdriver.maxRequestAcceptNum"))

    Main.webDriverContainer = WebDriverContainer(p_queue=driver_queue, p_request_queue=request_queue)
    Main.webDriverContainer.run()

    end = datetime.datetime.now()
    # BUG FIX: the original computed (start - end), a negative timedelta whose
    # .seconds attribute is a large bogus value; elapsed time is end - start.
    duration = (end - start).seconds
    Logger.getLogger().info("Web Driver Pool Launched after %d seconds" % (duration))

    try:
      # Decode the configured delimiter, written as escaped hex (e.g.
      # "\x0d\x0a"), into its raw characters.
      delimiter = Configure.configure().value("server.webdriverServer.delimiter")
      deary = delimiter.split('\\x')
      destr = ''
      for i in range(len(deary)):
        if deary[i] != '':
          # NOTE(review): prepending reverses the byte order relative to the
          # configured string — confirm this is intentional.
          destr = chr(int(deary[i], 16)) + destr
      StreamHandler.startlisten(p_name="Headless-Webdriver-Server", p_prefix="server.webdriverServer", p_queue=request_queue, p_delimiter=destr)
    except (KeyboardInterrupt, SystemExit):
      pass
Esempio n. 13
0
  def fuckup(p_command=None):
    """Entry point: bootstrap configuration, logging, ES and the crawler scheduler.

    p_command: optional command-line overrides forwarded to Configure.load().
    """
    Main.rootdir = os.path.abspath('.')
    manager = Manager()

    # Application configuration
    config_file = "application-config.yml"
    Configure.load(p_dir=Main.rootdir + "/" + config_file, p_command=p_command)

    # Logging
    Logger()

    # Elasticsearch client
    ESHandler.ini()

    # Job schedule: shared queue between the register and the scheduler
    main_job_queue = ThreadSafeQueue(
        size=Configure.configure().value("scheduler.messageQueueSize", p_default=1000))

    crawler_picker = CrawlerPicker()
    Main.crawlerRegister = CrawlerRegister(p_crawler_picker=crawler_picker,
                                           p_main_jod_queue=main_job_queue)
    Main.crawlerRegister.start()

    Main.parellelSchedule = ParellelSchedule(p_main_jod_queue=main_job_queue)
    Main.parellelSchedule.start()

    # After starting the sub-processes, join to make shared objects available.
    Main.crawlerRegister.join()

    try:
        # Keep the main thread alive (which keeps the daemons running).
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        pass
Esempio n. 14
0
    def fuckup(p_command=None):
        """Entry point: bootstrap a work node — config, logging, ES, monitor, leader, server.

        p_command: optional command-line overrides forwarded to Configure.load().
        """
        Main.rootdir = os.path.abspath('.')

        #Initialize application configure
        filename = "application-config.yml"
        Configure.load(p_dir=Main.rootdir + "/" + filename,
                       p_command=p_command)

        nodename = Configure.configure().value("worknode.name")
        # Discover this host's outbound IP via a UDP "connect" (no packet is
        # actually sent for SOCK_DGRAM connect).
        s = None
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            s.connect(('8.8.8.8', 80))
            Main.ipAddr = s.getsockname()[0]
        finally:
            # BUG FIX: if socket() itself raised, `s` was unbound and the
            # original finally block died with a NameError.
            if s is not None:
                s.close()

        #Initialize log
        Logger()

        #Initialize elasticsearch client
        Main.es_client = ESHandler()

        #Initialize worker monitor
        monitor = MultiProcessJobWatcher()
        executors = {
            'default': ThreadPoolExecutor(1),
            'processpool': ProcessPoolExecutor(1)
        }
        job_defaults = {'coalesce': True, 'max_instances': 1}
        mosche = BackgroundScheduler(executors=executors,
                                     job_defaults=job_defaults,
                                     timezone=utc)
        mosche.add_job(monitor,
                       'interval',
                       seconds=Configure.configure().value(
                           "worknode.workerMonitorInterval"))
        # BUG FIX: the scheduler was configured but never started, so the
        # worker-monitor job never ran (compare the pool-manager run()).
        mosche.start()

        #Initialize worker leader
        leader = Leader(p_addr=Main.ipAddr,
                        p_node_name=nodename,
                        p_monitor=monitor)

        #Initialize node register and health info report schedule
        scheduleserveraddr = Configure.configure().value(
            "server.healthServer.host")
        scheduleserverport = Configure.configure().value(
            "server.healthServer.port")
        scheduleserver = {
            "host": scheduleserveraddr,
            "port": scheduleserverport
        }
        Main.communicator = Communicator(p_schedule_server=scheduleserver,
                                         p_leader=leader)

        #Initialize node job accept service
        ServerWrapper.listen(p_name=nodename,
                             p_prefix="server.nodeServer",
                             p_handler=leader)
        try:
            # BUG FIX: IOLoop.start() blocks until stopped, so the original's
            # trailing sleep loop was unreachable and its interrupt handler
            # called shutdown() on the undefined name `parellelSchedule`.
            tornado.ioloop.IOLoop.current().start()
        except (KeyboardInterrupt, SystemExit):
            # Not strictly necessary if daemonic mode is enabled.
            mosche.shutdown()