Exemple #1
0
    def _handle_message(self, websock, message):
        """Decode and dispatch one message received from chrome's websocket.

        The message is handled according to its "method" key (an event sent
        by chrome) or, failing that, its "result" key (a reply to a command
        sent earlier with send_to_chrome()):

        * Network.requestWillBeSent: notifies the current behavior, if any,
          of activity, then hands the decoded message to ``self.on_request``
          unless the request url is a ``data:`` url.
        * Page.loadEventFired: starts a new Behavior for ``self.url``. If
          behaviors had already been started a warning is logged and they
          are started again.
        * Console.messageAdded: logs the console message at debug level.
        * Debugger.paused: the breakpoint set in visit_page was hit; the
          paused script is replaced with a stub and execution resumed.
        * "result" present: passed to the behavior if it is waiting on that
          message id.

        Any other message is ignored.

        Args:
            websock: websocket the message arrived on; only ``websock.url``
                is used, for logging.
            message: JSON text of the message.
        """
        message = json.loads(message)
        # Look the method up once; None when the message is not an event.
        method = message["method"] if "method" in message else None

        if method == "Network.requestWillBeSent":
            if self._behavior:
                self._behavior.notify_of_activity()
            request_url = message["params"]["request"]["url"]
            if request_url.lower().startswith("data:"):
                self.logger.debug("ignoring data url {}".format(
                    request_url[:80]))
            elif self.on_request:
                self.on_request(message)
        elif method == "Page.loadEventFired":
            if self._behavior is None:
                self.logger.info(
                    "Page.loadEventFired, starting behaviors url={} message={}"
                    .format(self.url, message))
            else:
                # Logger.warn() is a deprecated alias of warning().
                self.logger.warning(
                    "Page.loadEventFired again, perhaps original url had a meta refresh, or behaviors accidentally navigated to another page? starting behaviors again url={} message={}"
                    .format(self.url, message))
            # Whether first load or repeat, behaviors start from scratch.
            self._behavior = Behavior(self.url, self)
            self._behavior.start()
        elif method == "Console.messageAdded":
            console_message = message["params"]["message"]
            self.logger.debug("{} console.{} {}".format(
                websock.url, console_message["level"],
                console_message["text"]))
        elif method == "Debugger.paused":
            # We hit the breakpoint set in visit_page. Get rid of google
            # analytics script!
            self.logger.debug("debugger paused! message={}".format(message))
            script_id = message["params"]["callFrames"][0]["location"][
                "scriptId"]

            # replace script
            self.send_to_chrome(
                method="Debugger.setScriptSource",
                params={
                    "scriptId": script_id,
                    "scriptSource":
                    "console.log('google analytics is no more!');",
                })

            # resume execution
            self.send_to_chrome(method="Debugger.resume")
        elif "result" in message:
            if self._behavior and self._behavior.is_waiting_on_result(
                    message["id"]):
                self._behavior.notify_of_result(message)
Exemple #2
0
    def _handle_message(self, websock, message):
        """Handle a single JSON message coming from chrome's websocket.

        Events (messages with a "method" key) are handled as follows:

        * Network.requestWillBeSent: the current behavior, if any, is told
          about the activity; then, unless the request is for a ``data:``
          url, the decoded message is passed to ``self.on_request``.
        * Page.loadEventFired: a new Behavior is created for ``self.url`` and
          started. When behaviors were already started this is logged as a
          warning, and they are started again.
        * Console.messageAdded: the page's console message is logged at
          debug level.
        * Debugger.paused: the breakpoint set in visit_page was hit; the
          paused script is replaced with a stub and execution is resumed.

        A message carrying a "result" key is a command reply; it is passed
        to the behavior when the behavior is waiting on its id. Everything
        else is ignored.

        Args:
            websock: source websocket; only ``websock.url`` is read, for
                logging.
            message: the message as JSON text.
        """
        message = json.loads(message)
        # None stands for "not an event"; evaluated once instead of per branch.
        method = message["method"] if "method" in message else None

        if method == "Network.requestWillBeSent":
            if self._behavior:
                self._behavior.notify_of_activity()
            request_url = message["params"]["request"]["url"]
            if request_url.lower().startswith("data:"):
                self.logger.debug("ignoring data url {}".format(request_url[:80]))
            elif self.on_request:
                self.on_request(message)
            return

        if method == "Page.loadEventFired":
            if self._behavior is None:
                self.logger.info("Page.loadEventFired, starting behaviors url={} message={}".format(self.url, message))
            else:
                # warning() replaces the deprecated Logger.warn() alias.
                self.logger.warning(
                    "Page.loadEventFired again, perhaps original url had a meta refresh, or behaviors accidentally navigated to another page? starting behaviors again url={} message={}".format(
                        self.url, message
                    )
                )
            # First load or not, a fresh Behavior takes over from here.
            self._behavior = Behavior(self.url, self)
            self._behavior.start()
            return

        if method == "Console.messageAdded":
            console_message = message["params"]["message"]
            self.logger.debug(
                "{} console.{} {}".format(websock.url, console_message["level"], console_message["text"])
            )
            return

        if method == "Debugger.paused":
            # We hit the breakpoint set in visit_page. Get rid of google
            # analytics script!
            self.logger.debug("debugger paused! message={}".format(message))
            script_id = message["params"]["callFrames"][0]["location"]["scriptId"]

            # replace script
            self.send_to_chrome(
                method="Debugger.setScriptSource",
                params={"scriptId": script_id, "scriptSource": "console.log('google analytics is no more!');"},
            )

            # resume execution
            self.send_to_chrome(method="Debugger.resume")
            return

        if "result" in message:
            if self._behavior and self._behavior.is_waiting_on_result(message["id"]):
                self._behavior.notify_of_result(message)
Exemple #3
0
class Browser:
    """Runs chrome/chromium to synchronously browse one page at a time using
    browse_page().

    start() creates a temporary working directory and starts a Chrome
    instance on the configured debug port; stop() stops that instance and
    removes the directory. browse_page() connects to the websocket url that
    start() obtained. Instances can be used as context managers, which call
    start() on entry and stop() on exit.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    # browse_page() gives up waiting for behaviors after this many seconds.
    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser'):
        """
        Args:
            chrome_port: debug port handed to Chrome.
            chrome_exe: chrome executable handed to Chrome.
        """
        # ids for messages sent to chrome; see send_to_chrome()
        self.command_id = itertools.count(1)
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self._behavior = None
        self._websock = None
        self._abort_browse_page = False
        self._chrome_instance = None
        # Set by start(); initialised here so stop() is safe to call even if
        # start() was never called or failed part way through.
        self._work_dir = None
        self._websocket_url = None

    def __repr__(self):
        return "{}.{}:{}".format(Browser.__module__, Browser.__qualname__, self.chrome_port)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Creates the working directory and starts chrome.

        Either step can raise; whatever was set up before the failure is
        released by a subsequent stop().
        """
        self._work_dir = tempfile.TemporaryDirectory()
        self._chrome_instance = Chrome(self.chrome_port, self.chrome_exe,
                self._work_dir.name, os.sep.join([self._work_dir.name, "chrome-user-data"]))
        self._websocket_url = self._chrome_instance.start()

    def stop(self):
        """Stops chrome and removes the working directory.

        Does nothing for whichever of the two was never set up. The
        directory is removed even when stopping chrome raises; that
        exception still propagates.
        """
        chrome, self._chrome_instance = self._chrome_instance, None
        work_dir, self._work_dir = self._work_dir, None
        try:
            if chrome is not None:
                chrome.stop()
        finally:
            if work_dir is not None:
                work_dir.cleanup()

    def abort_browse_page(self):
        """Asks a running browse_page() to raise BrowsingException."""
        self._abort_browse_page = True

    def browse_page(self, url, on_request=None):
        """Synchronously browses a page and runs behaviors.

        Runs the websocket connection to chrome on a separate thread and
        polls every half second until the behavior reports it is finished or
        HARD_TIMEOUT_SECONDS have elapsed. Before returning or raising, the
        websocket is closed, its thread joined (waiting up to 60 seconds)
        and the behavior reset.

        Args:
            url: the page to browse.
            on_request: optional callable, called with each decoded
                Network.requestWillBeSent message except those for data:
                urls.

        Raises BrowsingException if browsing the page fails in a non-critical
        way: the websocket was found closed, or abort_browse_page() was
        called.
        """
        self.url = url
        self.on_request = on_request

        self._websock = websocket.WebSocketApp(self._websocket_url,
                on_open=self._visit_page, on_message=self._handle_message)

        import random
        # random suffix tells apart threads of successive page visits
        thread_name = "WebsockThread{}-{}".format(self.chrome_port,
                ''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
        websock_thread = threading.Thread(target=self._websock.run_forever, name=thread_name, kwargs={'ping_timeout':0.5})
        websock_thread.start()
        self._start = time.time()

        try:
            while True:
                time.sleep(0.5)
                if not self._websock or not self._websock.sock or not self._websock.sock.connected:
                    raise BrowsingException("websocket closed, did chrome die? {}".format(self._websocket_url))
                elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
                    self.logger.info("finished browsing page, reached hard timeout of {} seconds url={}".format(Browser.HARD_TIMEOUT_SECONDS, self.url))
                    return
                elif self._behavior is not None and self._behavior.is_finished():
                    self.logger.info("finished browsing page according to behavior url={}".format(self.url))
                    return
                elif self._abort_browse_page:
                    raise BrowsingException("browsing page aborted")
        finally:
            if self._websock and self._websock.sock and self._websock.sock.connected:
                try:
                    self._websock.close()
                except BaseException as e:
                    self.logger.error("exception closing websocket {} - {}".format(self._websock, e))

            websock_thread.join(timeout=30)
            if websock_thread.is_alive():
                self.logger.error("{} still alive 30 seconds after closing {}, will forcefully nudge it again".format(websock_thread, self._websock))
                self._websock.keep_running = False
                websock_thread.join(timeout=30)
                if websock_thread.is_alive():
                    self.logger.critical("{} still alive 60 seconds after closing {}".format(websock_thread, self._websock))

            self._behavior = None

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        """Sends a JSON message built from ``kwargs`` to chrome.

        The next value of ``self.command_id`` is added as the message's
        "id".

        Args:
            suppress_logging: when true, the message is not logged.
            **kwargs: the message fields, e.g. ``method`` and ``params``.

        Returns:
            The id assigned to the message.
        """
        msg_id = next(self.command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs)
        if not suppress_logging:
            self.logger.debug('sending message to {}: {}'.format(self._websock, msg))
        self._websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        """Websocket on_open callback: enables the debugging domains used by
        _handle_message(), sets the google analytics breakpoint and
        navigates to ``self.url``."""
        for domain in ("Network", "Page", "Console", "Debugger", "Runtime"):
            self.send_to_chrome(method="{}.enable".format(domain))

        # disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
        self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _handle_message(self, websock, message):
        """Websocket on_message callback: decodes ``message`` (JSON text) and
        dispatches on its "method" key, or on the presence of "result" for
        replies to commands sent with send_to_chrome(). Messages matching
        neither are ignored. ``websock`` is only used for its ``url`` when
        logging."""
        message = json.loads(message)
        # None when the message is not an event
        method = message["method"] if "method" in message else None

        if method == "Network.requestWillBeSent":
            if self._behavior:
                self._behavior.notify_of_activity()
            request_url = message["params"]["request"]["url"]
            if request_url.lower().startswith("data:"):
                self.logger.debug("ignoring data url {}".format(request_url[:80]))
            elif self.on_request:
                self.on_request(message)
        elif method == "Page.loadEventFired":
            if self._behavior is None:
                self.logger.info("Page.loadEventFired, starting behaviors url={} message={}".format(self.url, message))
            else:
                # Logger.warn() is a deprecated alias of warning()
                self.logger.warning("Page.loadEventFired again, perhaps original url had a meta refresh, or behaviors accidentally navigated to another page? starting behaviors again url={} message={}".format(self.url, message))
            # first load or repeat, behaviors start from scratch
            self._behavior = Behavior(self.url, self)
            self._behavior.start()
        elif method == "Console.messageAdded":
            console_message = message["params"]["message"]
            self.logger.debug("{} console.{} {}".format(websock.url,
                console_message["level"], console_message["text"]))
        elif method == "Debugger.paused":
            # We hit the breakpoint set in visit_page. Get rid of google
            # analytics script!
            self.logger.debug("debugger paused! message={}".format(message))
            script_id = message['params']['callFrames'][0]['location']['scriptId']

            # replace script
            self.send_to_chrome(method="Debugger.setScriptSource", params={"scriptId": script_id, "scriptSource":"console.log('google analytics is no more!');"})

            # resume execution
            self.send_to_chrome(method="Debugger.resume")
        elif "result" in message:
            if self._behavior and self._behavior.is_waiting_on_result(message['id']):
                self._behavior.notify_of_result(message)
Exemple #4
0
class Browser:
    """Runs chrome/chromium to synchronously browse one page at a time using
    browse_page().

    start() creates a temporary working directory and starts a Chrome
    instance on the configured debug port; stop() stops that instance and
    removes the directory. browse_page() connects to the websocket url that
    start() obtained. Instances can be used as context managers, which call
    start() on entry and stop() on exit.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    # browse_page() gives up waiting for behaviors after this many seconds.
    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser'):
        """
        Args:
            chrome_port: debug port handed to Chrome.
            chrome_exe: chrome executable handed to Chrome.
        """
        # ids for messages sent to chrome; see send_to_chrome()
        self.command_id = itertools.count(1)
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self._behavior = None
        self._websock = None
        self._abort_browse_page = False
        self._chrome_instance = None
        # Set by start(); initialised here so stop() is safe to call even if
        # start() was never called or failed part way through.
        self._work_dir = None
        self._websocket_url = None

    def __repr__(self):
        return "{}.{}:{}".format(Browser.__module__, Browser.__qualname__,
                                 self.chrome_port)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Creates the working directory and starts chrome.

        Either step can raise; whatever was set up before the failure is
        released by a subsequent stop().
        """
        self._work_dir = tempfile.TemporaryDirectory()
        self._chrome_instance = Chrome(
            self.chrome_port, self.chrome_exe, self._work_dir.name,
            os.sep.join([self._work_dir.name, "chrome-user-data"]))
        self._websocket_url = self._chrome_instance.start()

    def stop(self):
        """Stops chrome and removes the working directory.

        Does nothing for whichever of the two was never set up. A failure to
        remove the directory is logged, not raised. The directory is removed
        even when stopping chrome raises; that exception still propagates.
        """
        chrome, self._chrome_instance = self._chrome_instance, None
        work_dir, self._work_dir = self._work_dir, None
        try:
            if chrome is not None:
                chrome.stop()
        finally:
            if work_dir is not None:
                try:
                    work_dir.cleanup()
                except Exception:
                    # Exception rather than a bare except, so that
                    # KeyboardInterrupt and SystemExit are not swallowed.
                    self.logger.error("exception deleting %s",
                                      work_dir,
                                      exc_info=True)

    def abort_browse_page(self):
        """Asks a running browse_page() to raise BrowsingException."""
        self._abort_browse_page = True

    def browse_page(self, url, on_request=None):
        """Synchronously browses a page and runs behaviors.

        Runs the websocket connection to chrome on a separate thread and
        polls every half second until the behavior reports it is finished or
        HARD_TIMEOUT_SECONDS have elapsed. Before returning or raising, the
        websocket is closed, its thread joined (waiting up to 60 seconds)
        and the behavior reset.

        Args:
            url: the page to browse.
            on_request: optional callable, called with each decoded
                Network.requestWillBeSent message except those for data:
                urls.

        Raises BrowsingException if browsing the page fails in a non-critical
        way: the websocket was found closed, or abort_browse_page() was
        called.
        """
        self.url = url
        self.on_request = on_request

        self._websock = websocket.WebSocketApp(self._websocket_url,
                                               on_open=self._visit_page,
                                               on_message=self._handle_message)

        import random
        # random suffix tells apart threads of successive page visits
        thread_name = "WebsockThread{}-{}".format(
            self.chrome_port, ''.join((random.choice(
                'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
            ) for _ in range(6))))
        websock_thread = threading.Thread(target=self._websock.run_forever,
                                          name=thread_name,
                                          kwargs={'ping_timeout': 0.5})
        websock_thread.start()
        self._start = time.time()

        try:
            while True:
                time.sleep(0.5)
                if not self._websock or not self._websock.sock or not self._websock.sock.connected:
                    raise BrowsingException(
                        "websocket closed, did chrome die? {}".format(
                            self._websocket_url))
                elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
                    self.logger.info(
                        "finished browsing page, reached hard timeout of {} seconds url={}"
                        .format(Browser.HARD_TIMEOUT_SECONDS, self.url))
                    return
                elif self._behavior is not None and self._behavior.is_finished():
                    self.logger.info(
                        "finished browsing page according to behavior url={}".
                        format(self.url))
                    return
                elif self._abort_browse_page:
                    raise BrowsingException("browsing page aborted")
        finally:
            if self._websock and self._websock.sock and self._websock.sock.connected:
                try:
                    self._websock.close()
                except BaseException as e:
                    self.logger.error(
                        "exception closing websocket {} - {}".format(
                            self._websock, e))

            websock_thread.join(timeout=30)
            if websock_thread.is_alive():
                self.logger.error(
                    "{} still alive 30 seconds after closing {}, will forcefully nudge it again"
                    .format(websock_thread, self._websock))
                self._websock.keep_running = False
                websock_thread.join(timeout=30)
                if websock_thread.is_alive():
                    self.logger.critical(
                        "{} still alive 60 seconds after closing {}".format(
                            websock_thread, self._websock))

            self._behavior = None

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        """Sends a JSON message built from ``kwargs`` to chrome.

        The next value of ``self.command_id`` is added as the message's
        "id".

        Args:
            suppress_logging: when true, the message is not logged.
            **kwargs: the message fields, e.g. ``method`` and ``params``.

        Returns:
            The id assigned to the message.
        """
        msg_id = next(self.command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs)
        if not suppress_logging:
            self.logger.debug('sending message to {}: {}'.format(
                self._websock, msg))
        self._websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        """Websocket on_open callback: enables the debugging domains used by
        _handle_message(), sets the google analytics breakpoint and
        navigates to ``self.url``."""
        for domain in ("Network", "Page", "Console", "Debugger", "Runtime"):
            self.send_to_chrome(method="{}.enable".format(domain))

        # disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
        self.send_to_chrome(
            method="Debugger.setBreakpointByUrl",
            params={
                "lineNumber": 1,
                "urlRegex": "https?://www.google-analytics.com/analytics.js",
            })

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _handle_message(self, websock, message):
        """Websocket on_message callback: decodes ``message`` (JSON text) and
        dispatches on its "method" key, or on the presence of "result" for
        replies to commands sent with send_to_chrome(). Messages matching
        neither are ignored. ``websock`` is only used for its ``url`` when
        logging."""
        message = json.loads(message)
        # None when the message is not an event
        method = message["method"] if "method" in message else None

        if method == "Network.requestWillBeSent":
            if self._behavior:
                self._behavior.notify_of_activity()
            request_url = message["params"]["request"]["url"]
            if request_url.lower().startswith("data:"):
                self.logger.debug("ignoring data url {}".format(
                    request_url[:80]))
            elif self.on_request:
                self.on_request(message)
        elif method == "Page.loadEventFired":
            if self._behavior is None:
                self.logger.info(
                    "Page.loadEventFired, starting behaviors url={} message={}"
                    .format(self.url, message))
            else:
                # Logger.warn() is a deprecated alias of warning()
                self.logger.warning(
                    "Page.loadEventFired again, perhaps original url had a meta refresh, or behaviors accidentally navigated to another page? starting behaviors again url={} message={}"
                    .format(self.url, message))
            # first load or repeat, behaviors start from scratch
            self._behavior = Behavior(self.url, self)
            self._behavior.start()
        elif method == "Console.messageAdded":
            console_message = message["params"]["message"]
            self.logger.debug("{} console.{} {}".format(
                websock.url, console_message["level"],
                console_message["text"]))
        elif method == "Debugger.paused":
            # We hit the breakpoint set in visit_page. Get rid of google
            # analytics script!
            self.logger.debug("debugger paused! message={}".format(message))
            script_id = message['params']['callFrames'][0]['location'][
                'scriptId']

            # replace script
            self.send_to_chrome(
                method="Debugger.setScriptSource",
                params={
                    "scriptId": script_id,
                    "scriptSource":
                    "console.log('google analytics is no more!');",
                })

            # resume execution
            self.send_to_chrome(method="Debugger.resume")
        elif "result" in message:
            if self._behavior and self._behavior.is_waiting_on_result(
                    message['id']):
                self._behavior.notify_of_result(message)
Exemple #5
0
class Browser:
    """Runs chrome/chromium to synchronously browse one page at a time using
    browse_page(). Currently the implementation starts up a new instance
    of chrome for each page browsed, always on the same debug port. (In the
    future, it may keep the browser running indefinitely.)"""

    logger = logging.getLogger(__module__ + "." + __qualname__)

    # browse_page() gives up waiting for behaviors after this many seconds.
    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', chrome_wait=60):
        """
        Args:
            chrome_port: debug port handed to Chrome.
            chrome_exe: chrome executable handed to Chrome.
            chrome_wait: handed to Chrome as its third argument.
        """
        # ids for messages sent to chrome; see send_to_chrome()
        self.command_id = itertools.count(1)
        # held for the whole of browse_page(), so pages are browsed one at a time
        self._lock = threading.Lock()
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self.chrome_wait = chrome_wait
        self._behavior = None
        self.websock = None
        self._shutdown_now = False

    def shutdown_now(self):
        """Asks a running browse_page() to stop at its next half-second poll."""
        self._shutdown_now = True

    def browse_page(self, url, on_request=None):
        """Synchronously browses a page and runs behaviors. First blocks to
        acquire lock to ensure we only browse one page at a time.

        A Chrome instance with a fresh temporary user data directory is
        used for the visit. The websocket connection runs on a separate
        thread while this method polls every half second until the websocket
        is found closed, HARD_TIMEOUT_SECONDS have elapsed, the behavior
        reports it is finished, or shutdown_now() has been called. The
        websocket is then closed, its thread joined and the behavior reset;
        this cleanup also runs if the polling loop raises.

        Args:
            url: the page to browse.
            on_request: optional callable, called with each decoded
                Network.requestWillBeSent message except those for data:
                urls.
        """
        with self._lock:
            self.url = url
            self.on_request = on_request
            with tempfile.TemporaryDirectory() as user_data_dir:
                with Chrome(self.chrome_port, self.chrome_exe, self.chrome_wait, user_data_dir) as websocket_url:
                    self.websock = websocket.WebSocketApp(websocket_url,
                            on_open=self._visit_page,
                            on_message=self._handle_message)

                    import random
                    # random suffix tells apart threads of successive page visits
                    thread_name = "WebsockThread{}-{}".format(self.chrome_port,
                            ''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
                    websock_thread = threading.Thread(target=self.websock.run_forever, name=thread_name, kwargs={'ping_timeout':0.5})
                    websock_thread.start()
                    start = time.time()

                    try:
                        while True:
                            time.sleep(0.5)
                            if not self.websock or not self.websock.sock or not self.websock.sock.connected:
                                self.logger.error("websocket closed, did chrome die??? {}".format(self.websock))
                                break
                            elif time.time() - start > Browser.HARD_TIMEOUT_SECONDS:
                                self.logger.info("finished browsing page, reached hard timeout of {} seconds url={}".format(Browser.HARD_TIMEOUT_SECONDS, self.url))
                                break
                            elif self._behavior is not None and self._behavior.is_finished():
                                self.logger.info("finished browsing page according to behavior url={}".format(self.url))
                                break
                            elif self._shutdown_now:
                                # Logger.warn() is a deprecated alias of warning()
                                self.logger.warning("immediate shutdown requested")
                                break
                    finally:
                        # Runs on the error path too, so an interrupted visit
                        # does not leave the websocket thread running or a
                        # stale behavior behind for the next page.
                        try:
                            self.websock.close()
                        except BaseException as e:
                            self.logger.error("exception closing websocket {} - {}".format(self.websock, e))

                        websock_thread.join()
                        self._behavior = None

    def send_to_chrome(self, **kwargs):
        """Sends a JSON message built from ``kwargs`` to chrome.

        The next value of ``self.command_id`` is added as the message's
        "id".

        Returns:
            The id assigned to the message.
        """
        msg_id = next(self.command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs)
        self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
        self.websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        """Websocket on_open callback: enables the debugging domains used by
        _handle_message(), sets the google analytics breakpoint and
        navigates to ``self.url``."""
        for domain in ("Network", "Page", "Console", "Debugger", "Runtime"):
            self.send_to_chrome(method="{}.enable".format(domain))

        # disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
        self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _handle_message(self, websock, message):
        """Websocket on_message callback: decodes ``message`` (JSON text) and
        dispatches on its "method" key, or on the presence of "result" for
        replies to commands sent with send_to_chrome(). Messages matching
        neither are ignored. ``websock`` is only used for its ``url`` when
        logging."""
        message = json.loads(message)
        # None when the message is not an event
        method = message["method"] if "method" in message else None

        if method == "Network.requestWillBeSent":
            if self._behavior:
                self._behavior.notify_of_activity()
            request_url = message["params"]["request"]["url"]
            if request_url.lower().startswith("data:"):
                self.logger.debug("ignoring data url {}".format(request_url[:80]))
            elif self.on_request:
                self.on_request(message)
        elif method == "Page.loadEventFired":
            if self._behavior is None:
                self.logger.info("Page.loadEventFired, starting behaviors url={} message={}".format(self.url, message))
                self._behavior = Behavior(self.url, self)
                self._behavior.start()
            else:
                # behaviors keep running; Logger.warn() is a deprecated
                # alias of warning()
                self.logger.warning("Page.loadEventFired but behaviors already running url={} message={}".format(self.url, message))
        elif method == "Console.messageAdded":
            console_message = message["params"]["message"]
            self.logger.debug("{} console.{} {}".format(websock.url,
                console_message["level"], console_message["text"]))
        elif method == "Debugger.paused":
            # We hit the breakpoint set in visit_page. Get rid of google
            # analytics script!
            self.logger.debug("debugger paused! message={}".format(message))
            script_id = message['params']['callFrames'][0]['location']['scriptId']

            # replace script
            self.send_to_chrome(method="Debugger.setScriptSource", params={"scriptId": script_id, "scriptSource":"console.log('google analytics is no more!');"})

            # resume execution
            self.send_to_chrome(method="Debugger.resume")
        elif "result" in message:
            if self._behavior and self._behavior.is_waiting_on_result(message['id']):
                self._behavior.notify_of_result(message)