Example #1
0
    def _page_load_event_fired(self, message):
        self.logger.info(
            "Page.loadEventFired, moving on to starting behaviors url={}".
            format(self.url))
        self._behavior = Behavior(self.url, self)
        self._behavior.start(self.behavior_parameters)

        self._waiting_on_document_url_msg_id = self.send_to_chrome(
            method="Runtime.evaluate", params={"expression": "document.URL"})
Example #2
0
    def _page_load_event_fired(self, message):
        def page_url_after_load_event(message):
            if message["result"]["result"]["value"] != self.url:
                if self.on_url_change:
                    self.on_url_change(message["result"]["result"]["value"])
        msg_id = self.send_to_chrome(
                method="Runtime.evaluate",
                params={"expression":"document.URL"})
        self._waiting_on_result_messages[msg_id] = page_url_after_load_event

        self.logger.info("Page.loadEventFired, moving on to starting behaviors url={}".format(self.url))
        self._behavior = Behavior(self.url, self)
        self._behavior.start(self.behavior_parameters)
Example #3
0
 def _handle_result_message(self, message):
     if message["id"] == self._waiting_on_screenshot_msg_id:
         if self.on_screenshot:
             self.on_screenshot(base64.b64decode(message["result"]["data"]))
         self._waiting_on_screenshot_msg_id = None
         self.logger.info("got screenshot, moving on to starting behaviors url={}".format(self.url))
         self._behavior = Behavior(self.url, self)
         self._behavior.start()
     elif message["id"] == self._waiting_on_outlinks_msg_id:
         self.logger.debug("got outlinks message=%s", message)
         self._outlinks = frozenset(message["result"]["result"]["value"].split(" "))
     elif message["id"] == self._waiting_on_document_url_msg_id:
         if message["result"]["result"]["value"] != self.url:
             if self.on_url_change:
                 self.on_url_change(message["result"]["result"]["value"])
         self._waiting_on_document_url_msg_id = None
     elif self._behavior and self._behavior.is_waiting_on_result(message["id"]):
         self._behavior.notify_of_result(message)
Example #4
0
class Browser:
    """
    Runs chrome/chromium to synchronously browse one page at a time using
    worker.browse_page(). Should not be accessed from multiple threads.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(
            self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None,
            ignore_cert_errors=False):
        self.command_id = itertools.count(1)
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self.proxy = proxy
        self.ignore_cert_errors = ignore_cert_errors
        self._behavior = None
        self._websock = None
        self._abort_browse_page = False
        self._chrome_instance = None
        self._aw_snap_hes_dead_jim = None
        self._work_dir = None
        self._websocket_url = None

    def __repr__(self):
        return "{}.{}:{}".format(Browser.__module__, Browser.__qualname__, self.chrome_port)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self, proxy=None, cookie_db=None):
        if not self._chrome_instance:
            self._chrome_instance = Chrome(
                    port=self.chrome_port, executable=self.chrome_exe,
                    ignore_cert_errors=self.ignore_cert_errors,
                    proxy=proxy or self.proxy, cookie_db=None)
            try:
                self._websocket_url = self._chrome_instance.start()
            except:
                self._chrome_instance = None
                raise

    def stop(self):
        try:
            if self.is_running():
                self._chrome_instance.stop()
                self._chrome_instance = None
                self._websocket_url = None
        except:
            self.logger.error("problem stopping", exc_info=True)

    def is_running(self):
        return bool(self._websocket_url)

    def abort_browse_page(self):
        self._abort_browse_page = True

    def persist_and_read_cookie_db(self):
        if self._chrome_instance:
            return self._chrome_instance.persist_and_read_cookie_db()
        else:
            return None

    def browse_page(
            self, url, extra_headers=None, behavior_parameters=None,
            user_agent=None,
            on_request=None, on_response=None, on_screenshot=None,
            on_url_change=None):
        """
        Synchronously loads a page, runs behaviors, and takes a screenshot.

        Raises BrowsingException if browsing the page fails in a non-critical
        way.

        Returns extracted outlinks.
        """
        if not self.is_running():
            raise BrowsingException("browser has not been started")
        self.url = url
        self.extra_headers = extra_headers
        self.user_agent = user_agent
        self.on_request = on_request
        self.on_screenshot = on_screenshot
        self.on_url_change = on_url_change
        self.on_response = on_response
        self.behavior_parameters = behavior_parameters

        self._outlinks = None
        self._reached_limit = None
        self._aw_snap_hes_dead_jim = None
        self._abort_browse_page = False
        self._has_screenshot = False
        self._waiting_on_result_messages = {}
        self._result_message_timeout = None

        self._websock = websocket.WebSocketApp(
                self._websocket_url, on_open=self._visit_page,
                on_message=self._wrap_handle_message)

        thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
                self.chrome_port, datetime.datetime.utcnow())
        websock_thread = threading.Thread(
                target=self._websock.run_forever, name=thread_name,
                kwargs={'ping_timeout':0.5})
        websock_thread.start()
        self._start = time.time()
        aborted = False

        try:
            while True:
                time.sleep(0.5)
                if self._browse_interval_func():
                    return self._outlinks
        finally:
            if (self._websock and self._websock.sock
                    and self._websock.sock.connected):
                try:
                    self._websock.close()
                except BaseException as e:
                    self.logger.error(
                            "exception closing websocket %s - %s" % (
                                self._websock, e))

            websock_thread.join(timeout=30)
            if websock_thread.is_alive():
                self.logger.error(
                        "%s still alive 30 seconds after closing %s, will "
                        "forcefully nudge it again" % (
                            websock_thread, self._websock))
                self._websock.keep_running = False
                websock_thread.join(timeout=30)
                if websock_thread.is_alive():
                    self.logger.critical(
                            "%s still alive 60 seconds after closing %s" % (
                                websock_thread, self._websock))

            self._behavior = None

    OUTLINKS_JS = r"""
var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) {
    __brzl_framesDone.add(frame);
    if (frame && frame.document) {
        var outlinks = Array.prototype.slice.call(
                frame.document.querySelectorAll('a[href]'));
        for (var i = 0; i < frame.frames.length; i++) {
            if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
                outlinks = outlinks.concat(__brzl_compileOutlinks(frame.frames[i]));
            }
        }
    }
    return outlinks;
}
__brzl_compileOutlinks(window).join('\n');
"""

    def _chain_chrome_messages(self, chain):
        """
        Sends a series of messages to chrome/chromium on the debugging protocol
        websocket. Waits for a reply from each one before sending the next.
        Enforces a timeout waiting for each reply. If the timeout is hit, sets
        self._result_message_timeout with a ResultMessageTimeout (an exception
        class). Takes an array of dicts, each of which should look like this:

            {
                "info": "human readable description",
                "chrome_msg": { ... },   # message to send to chrome, as a dict
                "timeout": 30,           # timeout in seconds
                "callback": my_callback, # takes one arg, the result message
            }

        The code is rather convoluted because of the asynchronous nature of the
        whole thing. See how it's used in _start_postbehavior_chain.
        """
        timer = None

        def callback(message):
            if timer:
                timer.cancel()
            if "callback" in chain[0]:
                chain[0]["callback"](message)
            self._chain_chrome_messages(chain[1:])

        def timeout():
            self._result_message_timeout = ResultMessageTimeout(
                    "timed out after %.1fs waiting for result message "
                    "for %s", chain[0]["timeout"], chain[0]["chrome_msg"])

        if chain:
            msg_id = self.send_to_chrome(**chain[0]["chrome_msg"])
            self._waiting_on_result_messages[msg_id] = callback
            self.logger.info(
                    "msg_id=%s for message %s", msg_id, chain[0]["chrome_msg"])
            timer = threading.Timer(chain[0]["timeout"], timeout)
            timer.daemon = True
            timer.start()
        else:
            self.logger.info("finished chrome message chain")

    def _start_postbehavior_chain(self):
        if self.on_screenshot:
            chain = [{
                "info": "scrolling to top",
                "chrome_msg": {
                    "method": "Runtime.evaluate",
                    "params": {"expression": "window.scrollTo(0, 0);"},
                },
                "timeout": 30,
                "callback": lambda message: None,
            }, {
                "info": "requesting screenshot",
                "chrome_msg": {"method": "Page.captureScreenshot"},
                "timeout": 30,
                "callback": lambda message: (
                        self.on_screenshot and self.on_screenshot(
                            base64.b64decode(message["result"]["data"]))),
            }]
        else:
            chain = []

        def set_outlinks(message):
            if message["result"]["result"]["value"]:
                self._outlinks = frozenset(
                        message["result"]["result"]["value"].split("\n"))
            else:
                self._outlinks = frozenset()

        chain.append({
            "info": "retrieving outlinks",
            "chrome_msg": {
                "method": "Runtime.evaluate",
                "params": {"expression": self.OUTLINKS_JS},
            },
            "timeout": 60,
            "callback": set_outlinks,
        })

        self._chain_chrome_messages(chain)

    def _browse_interval_func(self):
        """Called periodically while page is being browsed. Returns True when
        finished browsing."""
        if (not self._websock or not self._websock.sock
                or not self._websock.sock.connected):
            raise BrowsingException(
                    "websocket closed, did chrome die? {}".format(
                        self._websocket_url))
        elif self._result_message_timeout:
            raise self._result_message_timeout
        elif self._aw_snap_hes_dead_jim:
            raise BrowsingException(
                    """chrome tab went "aw snap" or "he's dead jim"!""")
        elif self._outlinks is not None:
            # setting self._outlinks is the last thing that happens in the
            # post-behavior chain
            return True
        elif (self._behavior != None and self._behavior.is_finished()
                or time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS):
            if self._behavior and self._behavior.is_finished():
                self.logger.info(
                        "behavior decided it's finished with %s", self.url)
            else:
                self.logger.info(
                        "reached hard timeout of %s seconds url=%s",
                        Browser.HARD_TIMEOUT_SECONDS, self.url)
            self._behavior = None
            self._start_postbehavior_chain()
            return False
        elif self._reached_limit:
            raise self._reached_limit
        elif self._abort_browse_page:
            raise BrowsingAborted("browsing page aborted")
        else:
            return False

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        msg_id = next(self.command_id)
        kwargs["id"] = msg_id
        msg = json.dumps(kwargs)
        if not suppress_logging:
            self.logger.debug("sending message to %s: %s", self._websock, msg)
        self._websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        # navigate to about:blank here to avoid situation where we navigate to
        # the same page that we're currently on, perhaps with a different
        # #fragment, which prevents Page.loadEventFired from happening
        self.send_to_chrome(method="Page.navigate", params={"url": "about:blank"})

        self.send_to_chrome(method="Network.enable")
        self.send_to_chrome(method="Page.enable")
        self.send_to_chrome(method="Console.enable")
        self.send_to_chrome(method="Debugger.enable")
        self.send_to_chrome(method="Runtime.enable")

        headers = self.extra_headers or {}
        headers['Accept-Encoding'] = 'identity'
        self.send_to_chrome(
                method="Network.setExtraHTTPHeaders",
                params={"headers":headers})

        if self.user_agent:
            self.send_to_chrome(method="Network.setUserAgentOverride", params={"userAgent": self.user_agent})

        # disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
        self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _wrap_handle_message(self, websock, message):
        try:
            self._handle_message(websock, message)
        except:
            self.logger.error(
                    "uncaught exception in _handle_message message=%s",
                    message, exc_info=True)
            self.abort_browse_page()

    def _network_request_will_be_sent(self, message):
        if self._behavior:
            self._behavior.notify_of_activity()
        if message["params"]["request"]["url"].lower().startswith("data:"):
            self.logger.debug("ignoring data url {}".format(message["params"]["request"]["url"][:80]))
        elif self.on_request:
            self.on_request(message)

    def _network_response_received(self, message):
        if (not self._reached_limit
                and message["params"]["response"]["status"] == 420
                and "Warcprox-Meta" in CaseInsensitiveDict(
                    message["params"]["response"]["headers"])):
            warcprox_meta = json.loads(CaseInsensitiveDict(
                message["params"]["response"]["headers"])["Warcprox-Meta"])
            self._reached_limit = brozzler.ReachedLimit(
                    warcprox_meta=warcprox_meta)
            self.logger.info("reached limit %s", self._reached_limit)
        if self.on_response:
            self.on_response(message)

    def _page_load_event_fired(self, message):
        def page_url_after_load_event(message):
            if message["result"]["result"]["value"] != self.url:
                if self.on_url_change:
                    self.on_url_change(message["result"]["result"]["value"])
        msg_id = self.send_to_chrome(
                method="Runtime.evaluate",
                params={"expression":"document.URL"})
        self._waiting_on_result_messages[msg_id] = page_url_after_load_event

        self.logger.info("Page.loadEventFired, moving on to starting behaviors url={}".format(self.url))
        self._behavior = Behavior(self.url, self)
        self._behavior.start(self.behavior_parameters)

    def _console_message_added(self, message):
        self.logger.debug("%s console.%s %s", self._websock.url,
                message["params"]["message"]["level"],
                message["params"]["message"]["text"])

    def _debugger_paused(self, message):
        # We hit the breakpoint set in visit_page. Get rid of google
        # analytics script!
        self.logger.debug("debugger paused! message={}".format(message))
        scriptId = message['params']['callFrames'][0]['location']['scriptId']

        # replace script
        self.send_to_chrome(method="Debugger.setScriptSource", params={"scriptId": scriptId, "scriptSource":"console.log('google analytics is no more!');"})

        # resume execution
        self.send_to_chrome(method="Debugger.resume")

    def _handle_message(self, websock, json_message):
        message = json.loads(json_message)
        if "method" in message:
            if message["method"] == "Network.requestWillBeSent":
                self._network_request_will_be_sent(message)
            elif message["method"] == "Network.responseReceived":
                self._network_response_received(message)
            elif message["method"] == "Page.loadEventFired":
                self._page_load_event_fired(message)
            elif message["method"] == "Console.messageAdded":
                self._console_message_added(message)
            elif message["method"] == "Debugger.paused":
                self._debugger_paused(message)
            elif message["method"] == "Inspector.targetCrashed":
                self._aw_snap_hes_dead_jim = message
            # elif message["method"] in (
            #         "Network.dataReceived", "Network.responseReceived",
            #         "Network.loadingFinished"):
            #     pass
            # else:
            #     self.logger.debug("%s %s", message["method"], json_message)
        elif "result" in message:
            if message["id"] in self._waiting_on_result_messages:
                callback = self._waiting_on_result_messages[message["id"]]
                del self._waiting_on_result_messages[message["id"]]
                self.logger.debug(
                        "received result for message id=%s, calling %s",
                        message["id"], callback)
                callback(message)
            elif self._behavior and self._behavior.is_waiting_on_result(
                    message["id"]):
                self._behavior.notify_of_result(message)
Example #5
0
class Browser:
    """
    Runs chrome/chromium to synchronously browse one page at a time using
    worker.browse_page(). Should not be accessed from multiple threads.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
        self.command_id = itertools.count(1)
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self.proxy = proxy
        self.ignore_cert_errors = ignore_cert_errors
        self._behavior = None
        self._websock = None
        self._abort_browse_page = False
        self._chrome_instance = None
        self._aw_snap_hes_dead_jim = None
        self._work_dir = None
        self._websocket_url = None

    def __repr__(self):
        return "{}.{}:{}".format(Browser.__module__, Browser.__qualname__, self.chrome_port)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self, proxy=None):
        if not self._chrome_instance:
            # these can raise exceptions
            self.chrome_port = self._find_available_port()
            self._work_dir = tempfile.TemporaryDirectory()
            self._chrome_instance = Chrome(port=self.chrome_port,
                    executable=self.chrome_exe,
                    user_home_dir=self._work_dir.name,
                    user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
                    ignore_cert_errors=self.ignore_cert_errors,
                    proxy=proxy or self.proxy)
            self._websocket_url = self._chrome_instance.start()

    def stop(self):
        try:
            if self.is_running():
                self._chrome_instance.stop()
                self._chrome_instance = None
                try:
                    self._work_dir.cleanup()
                except:
                    self.logger.error("exception deleting %s", self._work_dir,
                                      exc_info=True)
                self._work_dir = None
                self._websocket_url = None
        except:
            self.logger.error("problem stopping", exc_info=True)

    def _find_available_port(self):
        port_available = False
        port = self.chrome_port

        try:
            conns = psutil.net_connections(kind="tcp")
        except psutil.AccessDenied:
            return port

        for p in range(port, 65535):
            if any(connection.laddr[1] == p for connection in conns):
                self.logger.warn("port %s already open, will try %s", p, p+1)
            else:
                port = p
                break
        return port

    def is_running(self):
        return bool(self._websocket_url)

    def abort_browse_page(self):
        self._abort_browse_page = True

    def browse_page(
            self, url, extra_headers=None, behavior_parameters=None,
            on_request=None, on_response=None, on_screenshot=None,
            on_url_change=None):
        """Synchronously loads a page, takes a screenshot, and runs behaviors.

        Raises BrowsingException if browsing the page fails in a non-critical
        way.

        Returns extracted outlinks.
        """
        if not self.is_running():
            raise BrowsingException("browser has not been started")
        self.url = url
        self.extra_headers = extra_headers
        self.on_request = on_request
        self.on_screenshot = on_screenshot
        self.on_url_change = on_url_change
        self.on_response = on_response
        self.behavior_parameters = behavior_parameters

        self._waiting_on_scroll_to_top_msg_id = None
        self._waiting_on_screenshot_msg_id = None
        self._waiting_on_document_url_msg_id = None
        self._waiting_on_outlinks_msg_id = None
        self._outlinks = None
        self._reached_limit = None
        self._aw_snap_hes_dead_jim = None
        self._abort_browse_page = False
        self._has_screenshot = False

        self._websock = websocket.WebSocketApp(self._websocket_url,
                on_open=self._visit_page, on_message=self._wrap_handle_message)

        threadName = "WebsockThread{}-{}".format(self.chrome_port,
                ''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
        websock_thread = threading.Thread(target=self._websock.run_forever, name=threadName, kwargs={'ping_timeout':0.5})
        websock_thread.start()
        self._start = time.time()
        aborted = False

        try:
            while True:
                time.sleep(0.5)
                if self._browse_interval_func():
                    break

            while True:
                time.sleep(0.5)
                if self._post_behavior_interval_func():
                    return self._outlinks
        finally:
            if self._websock and self._websock.sock and self._websock.sock.connected:
                try:
                    self._websock.close()
                except BaseException as e:
                    self.logger.error("exception closing websocket {} - {}".format(self._websock, e))

            websock_thread.join(timeout=30)
            if websock_thread.is_alive():
                self.logger.error("{} still alive 30 seconds after closing {}, will forcefully nudge it again".format(websock_thread, self._websock))
                self._websock.keep_running = False
                websock_thread.join(timeout=30)
                if websock_thread.is_alive():
                    self.logger.critical("{} still alive 60 seconds after closing {}".format(websock_thread, self._websock))

            self._behavior = None

    def _post_behavior_interval_func(self):
        """Called periodically after behavior is finished on the page. Returns
        true when post-behavior tasks are finished."""
        if not self._has_screenshot and (
                not self._waiting_on_scroll_to_top_msg_id
                and not self._waiting_on_screenshot_msg_id):
            if time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
                self.logger.info(
                        "reached hard timeout of {} seconds url={}".format(
                            Browser.HARD_TIMEOUT_SECONDS, self.url))
            else:
                self.logger.info(
                        "behavior decided it's finished with %s", self.url)

            self.logger.info(
                    "scrolling to the top, then requesting screenshot %s",
                    self.url)
            self._waiting_on_scroll_to_top_msg_id = self.send_to_chrome(
                    method="Runtime.evaluate",
                    params={"expression":"window.scrollTo(0, 0);"})
            return False
        elif not self._has_screenshot and (
                self._waiting_on_scroll_to_top_msg_id
                or self._waiting_on_screenshot_msg_id):
            return False

        if self._outlinks:
            self.logger.info("got outlinks, finished browsing %s", self.url)
            return True
        elif not self._waiting_on_outlinks_msg_id:
            self.logger.info("retrieving outlinks for %s", self.url)
            self._waiting_on_outlinks_msg_id = self.send_to_chrome(
                    method="Runtime.evaluate",
                    params={"expression":"Array.prototype.slice.call(document.querySelectorAll('a[href]')).join(' ')"})
            return False
        else: # self._waiting_on_outlinks_msg_id
            return False

    def _browse_interval_func(self):
        """Called periodically while page is being browsed. Returns True when
        finished browsing."""
        if not self._websock or not self._websock.sock or not self._websock.sock.connected:
            raise BrowsingException("websocket closed, did chrome die? {}".format(self._websocket_url))
        elif self._aw_snap_hes_dead_jim:
            raise BrowsingException("""chrome tab went "aw snap" or "he's dead jim"!""")
        elif (self._behavior != None and self._behavior.is_finished()
                or time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS):
            return True
        elif self._reached_limit:
            raise self._reached_limit
        elif self._abort_browse_page:
            raise BrowsingAborted("browsing page aborted")
        else:
            return False

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        msg_id = next(self.command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs)
        if not suppress_logging:
            self.logger.debug('sending message to {}: {}'.format(self._websock, msg))
        self._websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        # navigate to about:blank here to avoid situation where we navigate to
        # the same page that we're currently on, perhaps with a different
        # #fragment, which prevents Page.loadEventFired from happening
        self.send_to_chrome(method="Page.navigate", params={"url": "about:blank"})

        self.send_to_chrome(method="Network.enable")
        self.send_to_chrome(method="Page.enable")
        self.send_to_chrome(method="Console.enable")
        self.send_to_chrome(method="Debugger.enable")
        self.send_to_chrome(method="Runtime.enable")

        if self.extra_headers:
            self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers})

        # disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
        self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _wrap_handle_message(self, websock, message):
        try:
            self._handle_message(websock, message)
        except:
            self.logger.error("uncaught exception in _handle_message", exc_info=True)
            self.abort_browse_page()

    def _network_request_will_be_sent(self, message):
        if self._behavior:
            self._behavior.notify_of_activity()
        if message["params"]["request"]["url"].lower().startswith("data:"):
            self.logger.debug("ignoring data url {}".format(message["params"]["request"]["url"][:80]))
        elif self.on_request:
            self.on_request(message)

    def _network_response_received(self, message):
        if (not self._reached_limit
                and message["params"]["response"]["status"] == 420
                and "Warcprox-Meta" in CaseInsensitiveDict(
                    message["params"]["response"]["headers"])):
            warcprox_meta = json.loads(CaseInsensitiveDict(
                message["params"]["response"]["headers"])["Warcprox-Meta"])
            self._reached_limit = brozzler.ReachedLimit(
                    warcprox_meta=warcprox_meta)
            self.logger.info("reached limit %s", self._reached_limit)
        if self.on_response:
            self.on_response(message)

    def _page_load_event_fired(self, message):
        self.logger.info("Page.loadEventFired, moving on to starting behaviors url={}".format(self.url))
        self._behavior = Behavior(self.url, self)
        self._behavior.start()

        self._waiting_on_document_url_msg_id = self.send_to_chrome(method="Runtime.evaluate", params={"expression":"document.URL"})

    def _console_message_added(self, message):
        self.logger.debug("%s console.%s %s", self._websock.url,
                message["params"]["message"]["level"],
                message["params"]["message"]["text"])

    def _debugger_paused(self, message):
        # We hit the breakpoint set in visit_page. Get rid of google
        # analytics script!
        self.logger.debug("debugger paused! message={}".format(message))
        scriptId = message['params']['callFrames'][0]['location']['scriptId']

        # replace script
        self.send_to_chrome(method="Debugger.setScriptSource", params={"scriptId": scriptId, "scriptSource":"console.log('google analytics is no more!');"})

        # resume execution
        self.send_to_chrome(method="Debugger.resume")

    def _handle_result_message(self, message):
        if message["id"] == self._waiting_on_screenshot_msg_id:
            if self.on_screenshot:
                self.on_screenshot(base64.b64decode(message["result"]["data"]))
            self._waiting_on_screenshot_msg_id = None
            self._has_screenshot = True
            self.logger.info("got screenshot, moving on to getting outlinks url={}".format(self.url))
        elif message["id"] == self._waiting_on_scroll_to_top_msg_id:
            self._waiting_on_screenshot_msg_id = self.send_to_chrome(method="Page.captureScreenshot")
            self._waiting_on_scroll_to_top_msg_id = None
        elif message["id"] == self._waiting_on_outlinks_msg_id:
            self.logger.debug("got outlinks message=%s", message)
            self._outlinks = frozenset(message["result"]["result"]["value"].split(" "))
        elif message["id"] == self._waiting_on_document_url_msg_id:
            if message["result"]["result"]["value"] != self.url:
                if self.on_url_change:
                    self.on_url_change(message["result"]["result"]["value"])
            self._waiting_on_document_url_msg_id = None
        elif self._behavior and self._behavior.is_waiting_on_result(message["id"]):
            self._behavior.notify_of_result(message)

    def _handle_message(self, websock, json_message):
        message = json.loads(json_message)
        if "method" in message and message["method"] == "Network.requestWillBeSent":
            self._network_request_will_be_sent(message)
        elif "method" in message and message["method"] == "Network.responseReceived":
            self._network_response_received(message)
        elif "method" in message and message["method"] == "Page.loadEventFired":
            self._page_load_event_fired(message)
        elif "method" in message and message["method"] == "Console.messageAdded":
            self._console_message_added(message)
        elif "method" in message and message["method"] == "Debugger.paused":
            self._debugger_paused(message)
        elif "method" in message and message["method"] == "Inspector.targetCrashed":
            self._aw_snap_hes_dead_jim = message
        elif "result" in message:
            self._handle_result_message(message)
Example #6
0
    def _page_load_event_fired(self, message):
        self.logger.info("Page.loadEventFired, moving on to starting behaviors url={}".format(self.url))
        self._behavior = Behavior(self.url, self)
        self._behavior.start()

        self._waiting_on_document_url_msg_id = self.send_to_chrome(method="Runtime.evaluate", params={"expression":"document.URL"})
Example #7
0
class Browser:
    """
    Runs chrome/chromium to synchronously browse one page at a time using
    worker.browse_page(). Should not be accessed from multiple threads.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(self,
                 chrome_port=9222,
                 chrome_exe='chromium-browser',
                 proxy=None,
                 ignore_cert_errors=False):
        self.command_id = itertools.count(1)
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self.proxy = proxy
        self.ignore_cert_errors = ignore_cert_errors
        self._behavior = None
        self._websock = None
        self._abort_browse_page = False
        self._chrome_instance = None
        self._aw_snap_hes_dead_jim = None
        self._work_dir = None
        self._websocket_url = None

    def __repr__(self):
        return "{}.{}:{}".format(Browser.__module__, Browser.__qualname__,
                                 self.chrome_port)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self, proxy=None, cookie_db=None):
        if not self._chrome_instance:
            # these can raise exceptions
            self.chrome_port = self._find_available_port()
            self._work_dir = tempfile.TemporaryDirectory()
            if cookie_db is not None:
                cookie_dir = os.path.join(self._work_dir.name,
                                          "chrome-user-data", "Default")
                cookie_location = os.path.join(cookie_dir, "Cookies")
                self.logger.debug("cookie DB provided, writing to %s",
                                  cookie_location)
                os.makedirs(cookie_dir, exist_ok=True)

                try:
                    with open(cookie_location, 'wb') as cookie_file:
                        cookie_file.write(cookie_db)
                except OSError:
                    self.logger.error("exception writing cookie file at %s",
                                      cookie_location,
                                      exc_info=True)

            self._chrome_instance = Chrome(
                port=self.chrome_port,
                executable=self.chrome_exe,
                user_home_dir=self._work_dir.name,
                user_data_dir=os.sep.join(
                    [self._work_dir.name, "chrome-user-data"]),
                ignore_cert_errors=self.ignore_cert_errors,
                proxy=proxy or self.proxy)
            try:
                self._websocket_url = self._chrome_instance.start()
            except:
                self._chrome_instance = None
                raise

    def stop(self):
        try:
            if self.is_running():
                self._chrome_instance.stop()
                self._chrome_instance = None
                try:
                    self._work_dir.cleanup()
                except:
                    self.logger.error("exception deleting %s",
                                      self._work_dir,
                                      exc_info=True)
                self._work_dir = None
                self._websocket_url = None
        except:
            self.logger.error("problem stopping", exc_info=True)

    def persist_and_read_cookie_db(self):
        cookie_location = os.path.join(self._work_dir.name, "chrome-user-data",
                                       "Default", "Cookies")
        self.logger.debug(
            "marking cookies persistent then reading file into memory: %s ",
            cookie_location)
        try:
            with sqlite3.connect(cookie_location) as conn:
                cur = conn.cursor()
                cur.execute("UPDATE cookies SET persistent = 1")
        except sqlite3.Error:
            self.logger.error("exception updating cookie DB", exc_info=True)

        cookie_db = None
        try:
            with open(cookie_location, "rb") as cookie_file:
                cookie_db = cookie_file.read()
        except OSError:
            self.logger.error("exception reading from cookie DB file %s",
                              cookie_location,
                              exc_info=True)
        return cookie_db

    def _find_available_port(self):
        port_available = False
        port = self.chrome_port

        try:
            conns = psutil.net_connections(kind="tcp")
        except psutil.AccessDenied:
            return port

        for p in range(port, 65535):
            if any(connection.laddr[1] == p for connection in conns):
                self.logger.warn("port %s already open, will try %s", p, p + 1)
            else:
                port = p
                break
        return port

    def is_running(self):
        return bool(self._websocket_url)

    def abort_browse_page(self):
        self._abort_browse_page = True

    def browse_page(self,
                    url,
                    extra_headers=None,
                    behavior_parameters=None,
                    on_request=None,
                    on_response=None,
                    on_screenshot=None,
                    on_url_change=None):
        """
        Synchronously loads a page, takes a screenshot, and runs behaviors.

        Raises BrowsingException if browsing the page fails in a non-critical
        way.

        Returns extracted outlinks.
        """
        if not self.is_running():
            raise BrowsingException("browser has not been started")
        self.url = url
        self.extra_headers = extra_headers
        self.on_request = on_request
        self.on_screenshot = on_screenshot
        self.on_url_change = on_url_change
        self.on_response = on_response
        self.behavior_parameters = behavior_parameters

        self._outlinks = None
        self._reached_limit = None
        self._aw_snap_hes_dead_jim = None
        self._abort_browse_page = False
        self._has_screenshot = False
        self._waiting_on_result_messages = {}
        self._result_message_timeout = None

        self._websock = websocket.WebSocketApp(
            self._websocket_url,
            on_open=self._visit_page,
            on_message=self._wrap_handle_message)

        threadName = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
            self.chrome_port, datetime.datetime.utcnow())
        websock_thread = threading.Thread(target=self._websock.run_forever,
                                          name=threadName,
                                          kwargs={'ping_timeout': 0.5})
        websock_thread.start()
        self._start = time.time()
        aborted = False

        try:
            while True:
                time.sleep(0.5)
                if self._browse_interval_func():
                    return self._outlinks
        finally:
            if (self._websock and self._websock.sock
                    and self._websock.sock.connected):
                try:
                    self._websock.close()
                except BaseException as e:
                    self.logger.error("exception closing websocket %s - %s" %
                                      (self._websock, e))

            websock_thread.join(timeout=30)
            if websock_thread.is_alive():
                self.logger.error(
                    "%s still alive 30 seconds after closing %s, will "
                    "forcefully nudge it again" %
                    (websock_thread, self._websock))
                self._websock.keep_running = False
                websock_thread.join(timeout=30)
                if websock_thread.is_alive():
                    self.logger.critical(
                        "%s still alive 60 seconds after closing %s" %
                        (websock_thread, self._websock))

            self._behavior = None

    OUTLINKS_JS = """
var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) {
    __brzl_framesDone.add(frame);
    var outlinks = Array.prototype.slice.call(
            frame.document.querySelectorAll('a[href]'));
    for (var i = 0; i < frame.frames.length; i++) {
        if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
            outlinks = outlinks.concat(__brzl_compileOutlinks(frame.frames[i]));
        }
    }
    return outlinks;
}
__brzl_compileOutlinks(window).join(' ');
"""

    def _chain_chrome_messages(self, chain):
        """
        Sends a series of messages to chrome/chromium on the debugging protocol
        websocket. Waits for a reply from each one before sending the next.
        Enforces a timeout waiting for each reply. If the timeout is hit, sets
        self._result_message_timeout with a ResultMessageTimeout (an exception
        class). Takes an array of dicts, each of which should look like this:

            {
                "info": "human readable description",
                "chrome_msg": { ... },   # message to send to chrome, as a dict
                "timeout": 30,           # timeout in seconds
                "callback": my_callback, # takes one arg, the result message
            }

        The code is rather convoluted because of the asynchronous nature of the
        whole thing. See how it's used in _start_postbehavior_chain.
        """
        timer = None

        def callback(message):
            if timer:
                timer.cancel()
            if message["id"] in self._waiting_on_result_messages:
                del self._waiting_on_result_messages[message["id"]]
            if "callback" in chain[0]:
                chain[0]["callback"](message)
            self._chain_chrome_messages(chain[1:])

        def timeout():
            self._result_message_timeout = ResultMessageTimeout(
                "timed out after %.1fs waiting for result message "
                "for %s", chain[0]["timeout"], chain[0]["chrome_msg"])

        if chain:
            msg_id = self.send_to_chrome(**chain[0]["chrome_msg"])
            self._waiting_on_result_messages[msg_id] = callback
            self.logger.info("msg_id=%s for message %s", msg_id,
                             chain[0]["chrome_msg"])
            timer = threading.Timer(chain[0]["timeout"], timeout)
            timer.daemon = True
            timer.start()
        else:
            self.logger.info("finished chrome message chain")

    def _start_postbehavior_chain(self):
        if self.on_screenshot:
            chain = [{
                "info": "scrolling to top",
                "chrome_msg": {
                    "method": "Runtime.evaluate",
                    "params": {
                        "expression": "window.scrollTo(0, 0);"
                    },
                },
                "timeout": 30,
                "callback": lambda message: None,
            }, {
                "info":
                "requesting screenshot",
                "chrome_msg": {
                    "method": "Page.captureScreenshot"
                },
                "timeout":
                30,
                "callback":
                lambda message: (self.on_screenshot and self.on_screenshot(
                    base64.b64decode(message["result"]["data"]))),
            }]
        else:
            chain = []

        def set_outlinks(message):
            self._outlinks = frozenset(
                message["result"]["result"]["value"].split())

        chain.append({
            "info": "retrieving outlinks",
            "chrome_msg": {
                "method": "Runtime.evaluate",
                "params": {
                    "expression": self.OUTLINKS_JS
                },
            },
            "timeout": 60,
            "callback": set_outlinks,
        })

        self._chain_chrome_messages(chain)

    def _browse_interval_func(self):
        """Called periodically while page is being browsed. Returns True when
        finished browsing."""
        if (not self._websock or not self._websock.sock
                or not self._websock.sock.connected):
            raise BrowsingException(
                "websocket closed, did chrome die? {}".format(
                    self._websocket_url))
        elif self._result_message_timeout:
            raise self._result_message_timeout
        elif self._aw_snap_hes_dead_jim:
            raise BrowsingException(
                """chrome tab went "aw snap" or "he's dead jim"!""")
        elif self._outlinks is not None:
            # setting self._outlinks is the last thing that happens in the
            # post-behavior chain
            return True
        elif (self._behavior != None and self._behavior.is_finished()
              or time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS):
            if self._behavior and self._behavior.is_finished():
                self.logger.info("behavior decided it's finished with %s",
                                 self.url)
            else:
                self.logger.info("reached hard timeout of %s seconds url=%s",
                                 Browser.HARD_TIMEOUT_SECONDS, self.url)
            self._behavior = None
            self._start_postbehavior_chain()
            return False
        elif self._reached_limit:
            raise self._reached_limit
        elif self._abort_browse_page:
            raise BrowsingAborted("browsing page aborted")
        else:
            return False

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        msg_id = next(self.command_id)
        kwargs["id"] = msg_id
        msg = json.dumps(kwargs)
        if not suppress_logging:
            self.logger.debug("sending message to %s: %s", self._websock, msg)
        self._websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        # navigate to about:blank here to avoid situation where we navigate to
        # the same page that we're currently on, perhaps with a different
        # #fragment, which prevents Page.loadEventFired from happening
        self.send_to_chrome(method="Page.navigate",
                            params={"url": "about:blank"})

        self.send_to_chrome(method="Network.enable")
        self.send_to_chrome(method="Page.enable")
        self.send_to_chrome(method="Console.enable")
        self.send_to_chrome(method="Debugger.enable")
        self.send_to_chrome(method="Runtime.enable")

        if self.extra_headers:
            self.send_to_chrome(method="Network.setExtraHTTPHeaders",
                                params={"headers": self.extra_headers})

        # disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
        self.send_to_chrome(
            method="Debugger.setBreakpointByUrl",
            params={
                "lineNumber": 1,
                "urlRegex": "https?://www.google-analytics.com/analytics.js"
            })

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _wrap_handle_message(self, websock, message):
        try:
            self._handle_message(websock, message)
        except:
            self.logger.error(
                "uncaught exception in _handle_message message=%s",
                message,
                exc_info=True)
            self.abort_browse_page()

    def _network_request_will_be_sent(self, message):
        if self._behavior:
            self._behavior.notify_of_activity()
        if message["params"]["request"]["url"].lower().startswith("data:"):
            self.logger.debug("ignoring data url {}".format(
                message["params"]["request"]["url"][:80]))
        elif self.on_request:
            self.on_request(message)

    def _network_response_received(self, message):
        if (not self._reached_limit
                and message["params"]["response"]["status"] == 420
                and "Warcprox-Meta" in CaseInsensitiveDict(
                    message["params"]["response"]["headers"])):
            warcprox_meta = json.loads(
                CaseInsensitiveDict(
                    message["params"]["response"]["headers"])["Warcprox-Meta"])
            self._reached_limit = brozzler.ReachedLimit(
                warcprox_meta=warcprox_meta)
            self.logger.info("reached limit %s", self._reached_limit)
        if self.on_response:
            self.on_response(message)

    def _page_load_event_fired(self, message):
        self.logger.info(
            "Page.loadEventFired, moving on to starting behaviors url={}".
            format(self.url))
        self._behavior = Behavior(self.url, self)
        self._behavior.start(self.behavior_parameters)

        self._waiting_on_document_url_msg_id = self.send_to_chrome(
            method="Runtime.evaluate", params={"expression": "document.URL"})

    def _console_message_added(self, message):
        self.logger.debug("%s console.%s %s", self._websock.url,
                          message["params"]["message"]["level"],
                          message["params"]["message"]["text"])

    def _debugger_paused(self, message):
        # We hit the breakpoint set in visit_page. Get rid of google
        # analytics script!
        self.logger.debug("debugger paused! message={}".format(message))
        scriptId = message['params']['callFrames'][0]['location']['scriptId']

        # replace script
        self.send_to_chrome(method="Debugger.setScriptSource",
                            params={
                                "scriptId":
                                scriptId,
                                "scriptSource":
                                "console.log('google analytics is no more!');"
                            })

        # resume execution
        self.send_to_chrome(method="Debugger.resume")

    def _handle_message(self, websock, json_message):
        message = json.loads(json_message)
        if "method" in message:
            if message["method"] == "Network.requestWillBeSent":
                self._network_request_will_be_sent(message)
            elif message["method"] == "Network.responseReceived":
                self._network_response_received(message)
            elif message["method"] == "Page.loadEventFired":
                self._page_load_event_fired(message)
            elif message["method"] == "Console.messageAdded":
                self._console_message_added(message)
            elif message["method"] == "Debugger.paused":
                self._debugger_paused(message)
            elif message["method"] == "Inspector.targetCrashed":
                self._aw_snap_hes_dead_jim = message
            # elif message["method"] in (
            #         "Network.dataReceived", "Network.responseReceived",
            #         "Network.loadingFinished"):
            #     pass
            # else:
            #     self.logger.debug("%s %s", message["method"], json_message)
        elif "result" in message:
            if message["id"] in self._waiting_on_result_messages:
                callback = self._waiting_on_result_messages[message["id"]]
                self.logger.debug(
                    "received result for message id=%s, calling %s",
                    message["id"], callback)
                callback(message)
            elif self._behavior and self._behavior.is_waiting_on_result(
                    message["id"]):
                self._behavior.notify_of_result(message)