def _handle_message(self, websock, message): # self.logger.debug("message from {} - {}".format(websock.url, message[:95])) # self.logger.debug("message from {} - {}".format(websock.url, message)) message = json.loads(message) if "method" in message and message[ "method"] == "Network.requestWillBeSent": if self._behavior: self._behavior.notify_of_activity() if message["params"]["request"]["url"].lower().startswith("data:"): self.logger.debug("ignoring data url {}".format( message["params"]["request"]["url"][:80])) elif self.on_request: self.on_request(message) elif "method" in message and message["method"] == "Page.loadEventFired": if self._behavior is None: self.logger.info( "Page.loadEventFired, starting behaviors url={} message={}" .format(self.url, message)) self._behavior = Behavior(self.url, self) self._behavior.start() else: self.logger.warn( "Page.loadEventFired again, perhaps original url had a meta refresh, or behaviors accidentally navigated to another page? starting behaviors again url={} message={}" .format(self.url, message)) self._behavior = Behavior(self.url, self) self._behavior.start() elif "method" in message and message[ "method"] == "Console.messageAdded": self.logger.debug("{} console.{} {}".format( websock.url, message["params"]["message"]["level"], message["params"]["message"]["text"])) elif "method" in message and message["method"] == "Debugger.paused": # We hit the breakpoint set in visit_page. Get rid of google # analytics script! self.logger.debug("debugger paused! message={}".format(message)) scriptId = message['params']['callFrames'][0]['location'][ 'scriptId'] # replace script self.send_to_chrome( method="Debugger.setScriptSource", params={ "scriptId": scriptId, "scriptSource": "console.log('google analytics is no more!');" }) # resume execution self.send_to_chrome(method="Debugger.resume") elif "result" in message: if self._behavior and self._behavior.is_waiting_on_result( message['id']): self._behavior.notify_of_result(message)
def _handle_message(self, websock, message): # self.logger.debug("message from {} - {}".format(websock.url, message[:95])) # self.logger.debug("message from {} - {}".format(websock.url, message)) message = json.loads(message) if "method" in message and message["method"] == "Network.requestWillBeSent": if self._behavior: self._behavior.notify_of_activity() if message["params"]["request"]["url"].lower().startswith("data:"): self.logger.debug("ignoring data url {}".format(message["params"]["request"]["url"][:80])) elif self.on_request: self.on_request(message) elif "method" in message and message["method"] == "Page.loadEventFired": if self._behavior is None: self.logger.info("Page.loadEventFired, starting behaviors url={} message={}".format(self.url, message)) self._behavior = Behavior(self.url, self) self._behavior.start() else: self.logger.warn( "Page.loadEventFired again, perhaps original url had a meta refresh, or behaviors accidentally navigated to another page? starting behaviors again url={} message={}".format( self.url, message ) ) self._behavior = Behavior(self.url, self) self._behavior.start() elif "method" in message and message["method"] == "Console.messageAdded": self.logger.debug( "{} console.{} {}".format( websock.url, message["params"]["message"]["level"], message["params"]["message"]["text"] ) ) elif "method" in message and message["method"] == "Debugger.paused": # We hit the breakpoint set in visit_page. Get rid of google # analytics script! self.logger.debug("debugger paused! message={}".format(message)) scriptId = message["params"]["callFrames"][0]["location"]["scriptId"] # replace script self.send_to_chrome( method="Debugger.setScriptSource", params={"scriptId": scriptId, "scriptSource": "console.log('google analytics is no more!');"}, ) # resume execution self.send_to_chrome(method="Debugger.resume") elif "result" in message: if self._behavior and self._behavior.is_waiting_on_result(message["id"]): self._behavior.notify_of_result(message)
class Browser:
    """Runs chrome/chromium to synchronously browse one page at a time.

    ``start()`` creates a temporary work directory and starts a ``Chrome``
    instance on ``chrome_port``; ``stop()`` stops it and removes the work
    directory. The object is also a context manager that calls ``start()``
    on entry and ``stop()`` on exit. While started, ``browse_page()`` loads
    a url over chrome's debugging websocket and runs behaviors on it.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    # browse_page() stops waiting for a page after this many seconds
    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser'):
        """Remember how to launch chrome; nothing is started yet.

        Args:
            chrome_port: debugging port passed to ``Chrome``.
            chrome_exe: chrome executable passed to ``Chrome``.
        """
        self.command_id = itertools.count(1)
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self._behavior = None
        self._websock = None
        self._abort_browse_page = False
        self._chrome_instance = None
        # Defined up front so stop() is safe even if start() never ran.
        self._work_dir = None
        self._websocket_url = None

    def __repr__(self):
        return "{}.{}:{}".format(
            Browser.__module__, Browser.__qualname__, self.chrome_port)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Create the work directory and start chrome.

        Stores the websocket url returned by ``Chrome.start()`` for use by
        ``browse_page()``. Both steps can raise exceptions.
        """
        self._work_dir = tempfile.TemporaryDirectory()
        self._chrome_instance = Chrome(
            self.chrome_port, self.chrome_exe, self._work_dir.name,
            os.path.join(self._work_dir.name, "chrome-user-data"))
        self._websocket_url = self._chrome_instance.start()

    def stop(self):
        """Stop chrome and delete the work directory.

        Does nothing for whichever of the two was never created, so calling
        this without a prior ``start()`` is harmless.
        """
        if self._chrome_instance is not None:
            self._chrome_instance.stop()
        if self._work_dir is not None:
            self._work_dir.cleanup()

    def abort_browse_page(self):
        """Ask a running ``browse_page()`` to give up at its next poll."""
        self._abort_browse_page = True

    def _websock_connected(self):
        """Return a truthy value iff the websocket exists and is connected."""
        return (self._websock and self._websock.sock
                and self._websock.sock.connected)

    def browse_page(self, url, on_request=None):
        """Synchronously browses a page and runs behaviors.

        Opens the debugging websocket in a background thread and then polls
        every half second until the behavior reports that it is finished or
        ``HARD_TIMEOUT_SECONDS`` have elapsed. On the way out the websocket
        is closed, the thread is joined and the behavior is discarded.

        Args:
            url: the url to navigate to.
            on_request: optional callable invoked with each decoded
                ``Network.requestWillBeSent`` message (``data:`` urls
                excluded).

        Raises:
            BrowsingException: if browsing the page fails in a non-critical
                way, i.e. the websocket is found closed or
                ``abort_browse_page()`` was called.
        """
        # Local import kept from the original implementation.
        import random

        self.url = url
        self.on_request = on_request
        self._websock = websocket.WebSocketApp(
            self._websocket_url, on_open=self._visit_page,
            on_message=self._handle_message)

        # Random suffix so each page's websocket thread has a distinct name.
        suffix = ''.join(
            random.choice(
                'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
            for _ in range(6))
        thread_name = "WebsockThread{}-{}".format(self.chrome_port, suffix)
        websock_thread = threading.Thread(
            target=self._websock.run_forever, name=thread_name,
            kwargs={'ping_timeout': 0.5})
        websock_thread.start()
        self._start = time.time()

        try:
            while True:
                time.sleep(0.5)
                if not self._websock_connected():
                    raise BrowsingException(
                        "websocket closed, did chrome die? {}".format(
                            self._websocket_url))
                elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
                    self.logger.info(
                        "finished browsing page, reached hard timeout of %s "
                        "seconds url=%s",
                        Browser.HARD_TIMEOUT_SECONDS, self.url)
                    return
                elif self._behavior is not None and self._behavior.is_finished():
                    self.logger.info(
                        "finished browsing page according to behavior url=%s",
                        self.url)
                    return
                elif self._abort_browse_page:
                    raise BrowsingException("browsing page aborted")
        finally:
            if self._websock_connected():
                try:
                    self._websock.close()
                # NOTE(review): BaseException also swallows KeyboardInterrupt
                # and SystemExit here - confirm that is intended.
                except BaseException as e:
                    self.logger.error(
                        "exception closing websocket %s - %s",
                        self._websock, e)

            websock_thread.join(timeout=30)
            if websock_thread.is_alive():
                self.logger.error(
                    "%s still alive 30 seconds after closing %s, will "
                    "forcefully nudge it again",
                    websock_thread, self._websock)
                self._websock.keep_running = False
                websock_thread.join(timeout=30)
                if websock_thread.is_alive():
                    self.logger.critical(
                        "%s still alive 60 seconds after closing %s",
                        websock_thread, self._websock)

            self._behavior = None

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        """Send a command to chrome and return the id assigned to it.

        ``kwargs`` plus a fresh ``id`` from ``self.command_id`` are
        serialized as JSON and sent over the websocket.

        Args:
            suppress_logging: if true, the outgoing message is not logged.
            **kwargs: the fields of the command (e.g. ``method``,
                ``params``).

        Returns:
            The integer message id.
        """
        msg_id = next(self.command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs)
        if not suppress_logging:
            self.logger.debug('sending message to %s: %s', self._websock, msg)
        self._websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        """Websocket on_open callback: enable domains, then navigate."""
        for domain in ("Network", "Page", "Console", "Debugger", "Runtime"):
            self.send_to_chrome(method="{}.enable".format(domain))

        # disable google analytics, see _handle_message() where breakpoint
        # is caught "Debugger.paused"
        self.send_to_chrome(
            method="Debugger.setBreakpointByUrl",
            params={
                "lineNumber": 1,
                "urlRegex": "https?://www.google-analytics.com/analytics.js",
            })

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _handle_message(self, websock, message):
        """Websocket on_message callback: dispatch one message from chrome.

        ``message`` is JSON text. Request events are reported to the
        behavior and to ``self.on_request`` (``data:`` urls are only
        logged); a page load event starts a new ``Behavior``; console
        messages are logged; a debugger pause replaces the paused script
        with a stub and resumes; command results are forwarded to the
        behavior when it is waiting on that id. Anything else is ignored.
        """
        message = json.loads(message)
        method = message.get("method")

        if method == "Network.requestWillBeSent":
            if self._behavior:
                self._behavior.notify_of_activity()
            request_url = message["params"]["request"]["url"]
            if request_url.lower().startswith("data:"):
                self.logger.debug("ignoring data url %s", request_url[:80])
            elif self.on_request:
                self.on_request(message)
        elif method == "Page.loadEventFired":
            if self._behavior is None:
                self.logger.info(
                    "Page.loadEventFired, starting behaviors url=%s "
                    "message=%s", self.url, message)
            else:
                # Logger.warn is a deprecated alias (removed in Python 3.13).
                self.logger.warning(
                    "Page.loadEventFired again, perhaps original url had a "
                    "meta refresh, or behaviors accidentally navigated to "
                    "another page? starting behaviors again url=%s "
                    "message=%s", self.url, message)
            # First or repeated load event: start a fresh behavior.
            self._behavior = Behavior(self.url, self)
            self._behavior.start()
        elif method == "Console.messageAdded":
            console_message = message["params"]["message"]
            self.logger.debug(
                "%s console.%s %s", websock.url, console_message["level"],
                console_message["text"])
        elif method == "Debugger.paused":
            # We hit the breakpoint set in visit_page. Get rid of google
            # analytics script!
            self.logger.debug("debugger paused! message=%s", message)
            script_id = (
                message['params']['callFrames'][0]['location']['scriptId'])

            # replace script
            self.send_to_chrome(
                method="Debugger.setScriptSource",
                params={
                    "scriptId": script_id,
                    "scriptSource":
                        "console.log('google analytics is no more!');",
                })

            # resume execution
            self.send_to_chrome(method="Debugger.resume")
        elif "result" in message:
            if (self._behavior
                    and self._behavior.is_waiting_on_result(message['id'])):
                self._behavior.notify_of_result(message)
class Browser:
    """Runs chrome/chromium to synchronously browse one page at a time.

    A ``Chrome`` instance is started on ``chrome_port`` by ``start()`` (in
    a fresh temporary work directory) and shut down by ``stop()``; using
    the object in a ``with`` statement does both automatically. Between
    the two, ``browse_page()`` drives chrome over its debugging websocket
    to load a url and run behaviors on it.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    # upper bound, in seconds, on how long browse_page() waits for a page
    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser'):
        """Store launch settings; chrome is not started here.

        Args:
            chrome_port: debugging port handed to ``Chrome``.
            chrome_exe: chrome executable handed to ``Chrome``.
        """
        self.command_id = itertools.count(1)
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self._behavior = None
        self._websock = None
        self._abort_browse_page = False
        self._chrome_instance = None
        # Pre-declared so that stop() works even when start() never ran.
        self._work_dir = None
        self._websocket_url = None

    def __repr__(self):
        return "{}.{}:{}".format(
            Browser.__module__, Browser.__qualname__, self.chrome_port)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Create the temporary work directory and start chrome in it.

        The websocket url returned by ``Chrome.start()`` is kept for
        ``browse_page()``. These steps can raise exceptions.
        """
        self._work_dir = tempfile.TemporaryDirectory()
        work_path = self._work_dir.name
        self._chrome_instance = Chrome(
            self.chrome_port, self.chrome_exe, work_path,
            os.path.join(work_path, "chrome-user-data"))
        self._websocket_url = self._chrome_instance.start()

    def stop(self):
        """Stop chrome, then delete the work directory on a best-effort basis.

        Safe to call when ``start()`` was never called. A failure to delete
        the work directory is logged instead of raised.
        """
        if self._chrome_instance is not None:
            self._chrome_instance.stop()
        if self._work_dir is None:
            return
        try:
            self._work_dir.cleanup()
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            self.logger.error(
                "exception deleting %s", self._work_dir, exc_info=True)

    def abort_browse_page(self):
        """Make a running ``browse_page()`` raise at its next poll."""
        self._abort_browse_page = True

    def _websock_connected(self):
        """Return a truthy value iff the websocket exists and is connected."""
        websock = self._websock
        return websock and websock.sock and websock.sock.connected

    def browse_page(self, url, on_request=None):
        """Synchronously browses a page and runs behaviors.

        The debugging websocket is run in a background thread while this
        method polls twice a second. It returns normally once the behavior
        says it is finished, or after ``HARD_TIMEOUT_SECONDS``. In every
        case the websocket is closed, its thread is joined and the behavior
        is cleared before returning or raising.

        Args:
            url: the url to load.
            on_request: optional callable that receives each decoded
                ``Network.requestWillBeSent`` message, other than those
                for ``data:`` urls.

        Raises:
            BrowsingException: if browsing the page fails in a non-critical
                way: the websocket turned out to be closed, or
                ``abort_browse_page()`` was called.
        """
        # Function-scope import retained from the original code.
        import random

        self.url = url
        self.on_request = on_request
        self._websock = websocket.WebSocketApp(
            self._websocket_url, on_open=self._visit_page,
            on_message=self._handle_message)

        alphabet = (
            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
        # Random suffix gives each page's websocket thread a distinct name.
        thread_name = "WebsockThread{}-{}".format(
            self.chrome_port,
            ''.join(random.choice(alphabet) for _ in range(6)))
        websock_thread = threading.Thread(
            target=self._websock.run_forever, name=thread_name,
            kwargs={'ping_timeout': 0.5})
        websock_thread.start()
        self._start = time.time()

        try:
            while True:
                time.sleep(0.5)
                if not self._websock_connected():
                    raise BrowsingException(
                        "websocket closed, did chrome die? {}".format(
                            self._websocket_url))
                elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
                    self.logger.info(
                        "finished browsing page, reached hard timeout of %s "
                        "seconds url=%s",
                        Browser.HARD_TIMEOUT_SECONDS, self.url)
                    return
                elif self._behavior is not None and self._behavior.is_finished():
                    self.logger.info(
                        "finished browsing page according to behavior url=%s",
                        self.url)
                    return
                elif self._abort_browse_page:
                    raise BrowsingException("browsing page aborted")
        finally:
            if self._websock_connected():
                try:
                    self._websock.close()
                # NOTE(review): BaseException also catches
                # KeyboardInterrupt/SystemExit - confirm this is wanted.
                except BaseException as e:
                    self.logger.error(
                        "exception closing websocket %s - %s",
                        self._websock, e)

            websock_thread.join(timeout=30)
            if websock_thread.is_alive():
                self.logger.error(
                    "%s still alive 30 seconds after closing %s, will "
                    "forcefully nudge it again",
                    websock_thread, self._websock)
                self._websock.keep_running = False
                websock_thread.join(timeout=30)
                if websock_thread.is_alive():
                    self.logger.critical(
                        "%s still alive 60 seconds after closing %s",
                        websock_thread, self._websock)

            self._behavior = None

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        """Serialize a command as JSON, send it to chrome, return its id.

        Args:
            suppress_logging: when true the outgoing message is not logged.
            **kwargs: command fields such as ``method`` and ``params``; an
                ``id`` taken from ``self.command_id`` is added.

        Returns:
            The integer id that was attached to the message.
        """
        msg_id = next(self.command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs)
        if not suppress_logging:
            self.logger.debug('sending message to %s: %s', self._websock, msg)
        self._websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        """Websocket on_open callback: set chrome up and load ``self.url``."""
        self.send_to_chrome(method="Network.enable")
        self.send_to_chrome(method="Page.enable")
        self.send_to_chrome(method="Console.enable")
        self.send_to_chrome(method="Debugger.enable")
        self.send_to_chrome(method="Runtime.enable")

        # disable google analytics, see _handle_message() where breakpoint
        # is caught "Debugger.paused"
        breakpoint_params = {
            "lineNumber": 1,
            "urlRegex": "https?://www.google-analytics.com/analytics.js",
        }
        self.send_to_chrome(
            method="Debugger.setBreakpointByUrl", params=breakpoint_params)

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _handle_message(self, websock, message):
        """Websocket on_message callback: route one JSON message from chrome.

        * ``Network.requestWillBeSent``: notify the behavior of activity and
          call ``self.on_request`` (``data:`` urls are only logged).
        * ``Page.loadEventFired``: create and start a new ``Behavior``,
          replacing any current one.
        * ``Console.messageAdded``: log the console output.
        * ``Debugger.paused``: replace the paused script with a stub and
          resume.
        * a message with "result": forward to the behavior if it is waiting
          on that id.

        Other messages are ignored.
        """
        message = json.loads(message)
        method = message.get("method")

        if method == "Network.requestWillBeSent":
            if self._behavior:
                self._behavior.notify_of_activity()
            url = message["params"]["request"]["url"]
            if url.lower().startswith("data:"):
                self.logger.debug("ignoring data url %s", url[:80])
            elif self.on_request:
                self.on_request(message)

        elif method == "Page.loadEventFired":
            if self._behavior is None:
                self.logger.info(
                    "Page.loadEventFired, starting behaviors url=%s "
                    "message=%s", self.url, message)
            else:
                # warning() replaces the deprecated warn() alias, which is
                # gone in Python 3.13.
                self.logger.warning(
                    "Page.loadEventFired again, perhaps original url had a "
                    "meta refresh, or behaviors accidentally navigated to "
                    "another page? starting behaviors again url=%s "
                    "message=%s", self.url, message)
            # The same restart happens for a first and a repeated event.
            self._behavior = Behavior(self.url, self)
            self._behavior.start()

        elif method == "Console.messageAdded":
            entry = message["params"]["message"]
            self.logger.debug(
                "%s console.%s %s", websock.url, entry["level"],
                entry["text"])

        elif method == "Debugger.paused":
            # We hit the breakpoint set in visit_page. Get rid of google
            # analytics script!
            self.logger.debug("debugger paused! message=%s", message)
            top_frame = message['params']['callFrames'][0]

            # replace script
            self.send_to_chrome(
                method="Debugger.setScriptSource",
                params={
                    "scriptId": top_frame['location']['scriptId'],
                    "scriptSource":
                        "console.log('google analytics is no more!');",
                })

            # resume execution
            self.send_to_chrome(method="Debugger.resume")

        elif "result" in message:
            behavior = self._behavior
            if behavior and behavior.is_waiting_on_result(message['id']):
                behavior.notify_of_result(message)
class Browser:
    """Runs chrome/chromium to synchronously browse one page at a time.

    Each ``browse_page()`` call starts a new ``Chrome`` instance with a
    fresh temporary user data directory, always on the same debug port,
    and shuts it down again when the page is done. A lock ensures that
    only one page is browsed at a time per ``Browser`` object.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    # browse_page() stops waiting for a page after this many seconds
    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser',
                 chrome_wait=60):
        """Store the settings handed to ``Chrome`` for every page.

        Args:
            chrome_port: debugging port passed to ``Chrome``.
            chrome_exe: chrome executable passed to ``Chrome``.
            chrome_wait: passed through to ``Chrome`` unchanged.
        """
        self.command_id = itertools.count(1)
        self._lock = threading.Lock()
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self.chrome_wait = chrome_wait
        self._behavior = None
        self.websock = None
        self._shutdown_now = False

    def shutdown_now(self):
        """Ask a running ``browse_page()`` to stop at its next poll."""
        self._shutdown_now = True

    def browse_page(self, url, on_request=None):
        """Synchronously browses a page and runs behaviors.

        First blocks to acquire lock to ensure we only browse one page at
        a time. Then starts chrome, connects to its debugging websocket in
        a background thread and polls until browsing is finished. The
        websocket is closed, its thread joined and the behavior cleared
        even if polling is interrupted by an exception.

        Args:
            url: the url to load.
            on_request: optional callable that receives each decoded
                ``Network.requestWillBeSent`` message, other than those
                for ``data:`` urls.
        """
        # Function-scope import retained from the original code.
        import random

        with self._lock:
            self.url = url
            self.on_request = on_request
            with tempfile.TemporaryDirectory() as user_data_dir:
                with Chrome(self.chrome_port, self.chrome_exe,
                            self.chrome_wait, user_data_dir) as websocket_url:
                    self.websock = websocket.WebSocketApp(
                        websocket_url, on_open=self._visit_page,
                        on_message=self._handle_message)

                    # Random suffix so each page's thread has its own name.
                    suffix = ''.join(
                        random.choice(
                            'abcdefghijklmnopqrstuvwxyz'
                            'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
                        for _ in range(6))
                    thread_name = "WebsockThread{}-{}".format(
                        self.chrome_port, suffix)
                    websock_thread = threading.Thread(
                        target=self.websock.run_forever, name=thread_name,
                        kwargs={'ping_timeout': 0.5})
                    websock_thread.start()

                    try:
                        self._poll_until_finished(time.time())
                    finally:
                        # Runs on exceptions too, so the websocket and its
                        # thread are not left behind when polling fails.
                        try:
                            self.websock.close()
                        # NOTE(review): BaseException also catches
                        # KeyboardInterrupt/SystemExit - confirm intended.
                        except BaseException as e:
                            self.logger.error(
                                "exception closing websocket %s - %s",
                                self.websock, e)

                        websock_thread.join()
                        self._behavior = None

    def _poll_until_finished(self, start):
        """Poll every half second until browsing the page should stop.

        Returns when the websocket is found closed, the hard timeout
        (measured from ``start``, a ``time.time()`` value) is exceeded,
        the behavior reports it is finished, or shutdown was requested.
        """
        while True:
            time.sleep(0.5)
            if (not self.websock or not self.websock.sock
                    or not self.websock.sock.connected):
                self.logger.error(
                    "websocket closed, did chrome die??? %s", self.websock)
                return
            elif time.time() - start > Browser.HARD_TIMEOUT_SECONDS:
                self.logger.info(
                    "finished browsing page, reached hard timeout of %s "
                    "seconds url=%s",
                    Browser.HARD_TIMEOUT_SECONDS, self.url)
                return
            elif self._behavior is not None and self._behavior.is_finished():
                self.logger.info(
                    "finished browsing page according to behavior url=%s",
                    self.url)
                return
            elif self._shutdown_now:
                # Logger.warn is a deprecated alias (removed in Python 3.13).
                self.logger.warning("immediate shutdown requested")
                return

    def send_to_chrome(self, **kwargs):
        """Serialize a command as JSON, send it to chrome, return its id.

        Args:
            **kwargs: command fields such as ``method`` and ``params``; an
                ``id`` taken from ``self.command_id`` is added.

        Returns:
            The integer id that was attached to the message.
        """
        msg_id = next(self.command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs)
        self.logger.debug('sending message to %s: %s', self.websock, msg)
        self.websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        """Websocket on_open callback: enable domains, then navigate."""
        for domain in ("Network", "Page", "Console", "Debugger", "Runtime"):
            self.send_to_chrome(method="{}.enable".format(domain))

        # disable google analytics, see _handle_message() where breakpoint
        # is caught "Debugger.paused"
        self.send_to_chrome(
            method="Debugger.setBreakpointByUrl",
            params={
                "lineNumber": 1,
                "urlRegex": "https?://www.google-analytics.com/analytics.js",
            })

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _handle_message(self, websock, message):
        """Websocket on_message callback: route one JSON message from chrome.

        * ``Network.requestWillBeSent``: notify the behavior of activity and
          call ``self.on_request`` (``data:`` urls are only logged).
        * ``Page.loadEventFired``: create and start a ``Behavior`` if none
          exists yet; otherwise only log a warning.
        * ``Console.messageAdded``: log the console output.
        * ``Debugger.paused``: replace the paused script with a stub and
          resume.
        * a message with "result": forward to the behavior if it is waiting
          on that id.

        Other messages are ignored.
        """
        message = json.loads(message)
        method = message.get("method")

        if method == "Network.requestWillBeSent":
            if self._behavior:
                self._behavior.notify_of_activity()
            request_url = message["params"]["request"]["url"]
            if request_url.lower().startswith("data:"):
                self.logger.debug("ignoring data url %s", request_url[:80])
            elif self.on_request:
                self.on_request(message)
        elif method == "Page.loadEventFired":
            if self._behavior is None:
                self.logger.info(
                    "Page.loadEventFired, starting behaviors url=%s "
                    "message=%s", self.url, message)
                self._behavior = Behavior(self.url, self)
                self._behavior.start()
            else:
                # Logger.warn is a deprecated alias (removed in Python 3.13).
                self.logger.warning(
                    "Page.loadEventFired but behaviors already running "
                    "url=%s message=%s", self.url, message)
        elif method == "Console.messageAdded":
            console_message = message["params"]["message"]
            self.logger.debug(
                "%s console.%s %s", websock.url, console_message["level"],
                console_message["text"])
        elif method == "Debugger.paused":
            # We hit the breakpoint set in visit_page. Get rid of google
            # analytics script!
            self.logger.debug("debugger paused! message=%s", message)
            script_id = (
                message['params']['callFrames'][0]['location']['scriptId'])

            # replace script
            self.send_to_chrome(
                method="Debugger.setScriptSource",
                params={
                    "scriptId": script_id,
                    "scriptSource":
                        "console.log('google analytics is no more!');",
                })

            # resume execution
            self.send_to_chrome(method="Debugger.resume")
        elif "result" in message:
            if (self._behavior
                    and self._behavior.is_waiting_on_result(message['id'])):
                self._behavior.notify_of_result(message)