Example #1
0
    def __init__(self, **kwargs):
        '''
        Builds the Chrome object and resets all connection state.

        Args:
            **kwargs: handed unchanged to Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        # no websocket url, websocket or receiver thread has been set up yet
        self.websock_url = self.websock = self.websock_thread = None
        self.is_browsing = False
        # source of ids for messages sent to chrome
        self._command_id = Counter()
Example #2
0
 def start(self, proxy=None, cookie_db=None):
     """
     Starts chrome unless this browser already has a chrome instance.

     Args:
         proxy: proxy for this chrome instance; falls back to self.proxy
             when not supplied (default None)
         cookie_db: passed through to Chrome(...) as `cookie_db`
             (default None)

     Raises:
         whatever Chrome.start() raises; in that case the chrome instance
         is forgotten so that start() can be attempted again
     """
     if self._chrome_instance:
         return

     # previously a literal cookie_db=None was passed here, which silently
     # discarded the caller's cookie_db argument
     self._chrome_instance = Chrome(
             port=self.chrome_port, executable=self.chrome_exe,
             ignore_cert_errors=self.ignore_cert_errors,
             proxy=proxy or self.proxy, cookie_db=cookie_db)
     try:
         self._websocket_url = self._chrome_instance.start()
     except BaseException:
         self._chrome_instance = None
         raise
Example #3
0
    def __init__(self, **kwargs):
        '''
        Builds the Chrome object and resets all connection state.

        Args:
            **kwargs: handed unchanged to Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        self.is_browsing = False
        # neither a websocket url nor a controller exists yet
        self.websocket_url = self._browser_controller = None
Example #4
0
    def __init__(self, **kwargs):
        '''
        Builds the Chrome object and resets all connection state.

        Args:
            **kwargs: handed unchanged to Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        # no websocket url, websocket or receiver thread has been set up yet
        self.websock_url = self.websock = self.websock_thread = None
        self.is_browsing = False
        # source of ids for messages sent to chrome
        self._command_id = Counter()
        # presumably the polling interval in seconds -- the code that reads
        # it is not visible from here
        self._wait_interval = 0.5
Example #5
0
class Browser:
    """
    Runs chrome/chromium to synchronously browse one page at a time using
    worker.browse_page(). Should not be accessed from multiple threads.

    Can be used as a context manager: entering calls start(), leaving calls
    stop().
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    # once browse_page() has been running this long it stops waiting for the
    # behavior to finish and moves on to the post-behavior steps
    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(
            self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None,
            ignore_cert_errors=False):
        """
        Args:
            chrome_port: passed to Chrome(...) as `port` (default 9222)
            chrome_exe: passed to Chrome(...) as `executable`
                (default 'chromium-browser')
            proxy: default proxy, used when start() is called without one
                (default None)
            ignore_cert_errors: passed to Chrome(...) (default False)
        """
        # ids for messages sent to chrome, see send_to_chrome()
        self.command_id = itertools.count(1)
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self.proxy = proxy
        self.ignore_cert_errors = ignore_cert_errors
        self._behavior = None
        self._websock = None
        self._abort_browse_page = False
        self._chrome_instance = None
        self._aw_snap_hes_dead_jim = None
        self._work_dir = None
        self._websocket_url = None
        # True once the post-behavior chain has been kicked off for the
        # page currently being browsed
        self._postbehavior_chain_started = False

    def __repr__(self):
        return "{}.{}:{}".format(
                Browser.__module__, Browser.__qualname__, self.chrome_port)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self, proxy=None, cookie_db=None):
        """
        Starts chrome unless this browser already has a chrome instance.

        Args:
            proxy: proxy for this chrome instance; falls back to self.proxy
                when not supplied (default None)
            cookie_db: passed through to Chrome(...) as `cookie_db`
                (default None)

        Raises:
            whatever Chrome.start() raises; in that case the chrome instance
            is forgotten so that start() can be attempted again
        """
        if self._chrome_instance:
            return

        # previously a literal cookie_db=None was passed here, which silently
        # discarded the caller's cookie_db argument
        self._chrome_instance = Chrome(
                port=self.chrome_port, executable=self.chrome_exe,
                ignore_cert_errors=self.ignore_cert_errors,
                proxy=proxy or self.proxy, cookie_db=cookie_db)
        try:
            self._websocket_url = self._chrome_instance.start()
        except BaseException:
            self._chrome_instance = None
            raise

    def stop(self):
        """
        Stops chrome if it is running. Never raises; any problem is logged.
        """
        try:
            if self.is_running():
                self._chrome_instance.stop()
                self._chrome_instance = None
                self._websocket_url = None
        except BaseException:
            self.logger.error("problem stopping", exc_info=True)

    def is_running(self):
        """Returns True if start() has obtained a websocket url."""
        return bool(self._websocket_url)

    def abort_browse_page(self):
        """
        Asks the browse_page() call in progress to give up; it will raise
        BrowsingAborted from its polling loop.
        """
        self._abort_browse_page = True

    def persist_and_read_cookie_db(self):
        """
        Delegates to the chrome instance's persist_and_read_cookie_db(), or
        returns None if there is no chrome instance.
        """
        if self._chrome_instance:
            return self._chrome_instance.persist_and_read_cookie_db()
        return None

    def browse_page(
            self, url, extra_headers=None, behavior_parameters=None,
            user_agent=None,
            on_request=None, on_response=None, on_screenshot=None,
            on_url_change=None):
        """
        Synchronously loads a page, runs behaviors, and takes a screenshot.

        Args:
            url: url of the page to browse
            extra_headers: dict of extra http headers for chrome to send
                with every request; not modified (default None)
            behavior_parameters: passed to Behavior.start() (default None)
            user_agent: replaces the browser's user agent if supplied
                (default None)
            on_request: called with the decoded message for every
                Network.requestWillBeSent event, except requests for data:
                urls (default None)
            on_response: called with the decoded message for every
                Network.responseReceived event (default None)
            on_screenshot: called with the decoded screenshot bytes; if not
                supplied, no screenshot is taken (default None)
            on_url_change: called with the value of document.URL after the
                page load event, if it differs from url (default None)

        Raises:
            BrowsingException: if the browser has not been started, the
                websocket closes, or the tab crashes
            BrowsingAborted: if abort_browse_page() was called, or handling
                a message from chrome raised an exception
            ResultMessageTimeout: if chrome takes too long to answer one of
                the post-behavior messages
            brozzler.ReachedLimit: if a 420 response carrying a
                Warcprox-Meta header was received

        Returns extracted outlinks (a frozenset of strings).
        """
        if not self.is_running():
            raise BrowsingException("browser has not been started")
        self.url = url
        self.extra_headers = extra_headers
        self.user_agent = user_agent
        self.on_request = on_request
        self.on_screenshot = on_screenshot
        self.on_url_change = on_url_change
        self.on_response = on_response
        self.behavior_parameters = behavior_parameters

        # reset per-page state
        self._outlinks = None
        self._reached_limit = None
        self._aw_snap_hes_dead_jim = None
        self._abort_browse_page = False
        self._has_screenshot = False
        self._waiting_on_result_messages = {}
        self._result_message_timeout = None
        self._postbehavior_chain_started = False

        self._websock = websocket.WebSocketApp(
                self._websocket_url, on_open=self._visit_page,
                on_message=self._wrap_handle_message)

        thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
                self.chrome_port,
                datetime.datetime.now(datetime.timezone.utc))
        websock_thread = threading.Thread(
                target=self._websock.run_forever, name=thread_name,
                kwargs={'ping_timeout':0.5})
        websock_thread.start()
        self._start = time.time()

        try:
            # all the real work happens on the websocket thread; this thread
            # just polls for completion or failure
            while True:
                time.sleep(0.5)
                if self._browse_interval_func():
                    return self._outlinks
        finally:
            if (self._websock and self._websock.sock
                    and self._websock.sock.connected):
                try:
                    self._websock.close()
                except BaseException as e:
                    self.logger.error(
                            "exception closing websocket %s - %s",
                            self._websock, e)

            websock_thread.join(timeout=30)
            if websock_thread.is_alive():
                self.logger.error(
                        "%s still alive 30 seconds after closing %s, will "
                        "forcefully nudge it again",
                        websock_thread, self._websock)
                self._websock.keep_running = False
                websock_thread.join(timeout=30)
                if websock_thread.is_alive():
                    self.logger.critical(
                            "%s still alive 60 seconds after closing %s",
                            websock_thread, self._websock)

            self._behavior = None

    # javascript evaluated in the page to collect outlinks; the result is
    # the matched anchors joined with newlines (see set_outlinks in
    # _start_postbehavior_chain)
    OUTLINKS_JS = r"""
var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) {
    __brzl_framesDone.add(frame);
    if (frame && frame.document) {
        var outlinks = Array.prototype.slice.call(
                frame.document.querySelectorAll('a[href]'));
        for (var i = 0; i < frame.frames.length; i++) {
            if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
                outlinks = outlinks.concat(__brzl_compileOutlinks(frame.frames[i]));
            }
        }
    }
    return outlinks;
}
__brzl_compileOutlinks(window).join('\n');
"""

    def _chain_chrome_messages(self, chain):
        """
        Sends a series of messages to chrome/chromium on the debugging protocol
        websocket. Waits for a reply from each one before sending the next.
        Enforces a timeout waiting for each reply. If the timeout is hit, sets
        self._result_message_timeout with a ResultMessageTimeout (an exception
        class). Takes an array of dicts, each of which should look like this:

            {
                "info": "human readable description",
                "chrome_msg": { ... },   # message to send to chrome, as a dict
                "timeout": 30,           # timeout in seconds
                "callback": my_callback, # takes one arg, the result message
            }

        The code is rather convoluted because of the asynchronous nature of the
        whole thing. See how it's used in _start_postbehavior_chain.
        """
        if not chain:
            self.logger.info("finished chrome message chain")
            return

        step = chain[0]

        def timeout():
            self._result_message_timeout = ResultMessageTimeout(
                    "timed out after %.1fs waiting for result message "
                    "for %s" % (step["timeout"], step["chrome_msg"]))

        # create the timer before sending the message, so that a reply
        # arriving immediately on the websocket thread can always cancel it
        # (cancelling a Timer that has not been started yet stops it from
        # ever firing)
        timer = threading.Timer(step["timeout"], timeout)
        timer.daemon = True

        def callback(message):
            timer.cancel()
            if "callback" in step:
                step["callback"](message)
            self._chain_chrome_messages(chain[1:])

        msg_id = self.send_to_chrome(**step["chrome_msg"])
        self._waiting_on_result_messages[msg_id] = callback
        self.logger.info(
                "msg_id=%s for message %s", msg_id, step["chrome_msg"])
        timer.start()

    def _start_postbehavior_chain(self):
        """
        Kicks off the steps that follow the behavior: scroll to the top and
        take a screenshot (only if on_screenshot is set), then retrieve the
        outlinks. Setting self._outlinks is the last thing that happens.
        """
        if self.on_screenshot:
            chain = [{
                "info": "scrolling to top",
                "chrome_msg": {
                    "method": "Runtime.evaluate",
                    "params": {"expression": "window.scrollTo(0, 0);"},
                },
                "timeout": 30,
                "callback": lambda message: None,
            }, {
                "info": "requesting screenshot",
                "chrome_msg": {"method": "Page.captureScreenshot"},
                "timeout": 30,
                "callback": lambda message: (
                        self.on_screenshot and self.on_screenshot(
                            base64.b64decode(message["result"]["data"]))),
            }]
        else:
            chain = []

        def set_outlinks(message):
            value = message["result"]["result"]["value"]
            if value:
                self._outlinks = frozenset(value.split("\n"))
            else:
                self._outlinks = frozenset()

        chain.append({
            "info": "retrieving outlinks",
            "chrome_msg": {
                "method": "Runtime.evaluate",
                "params": {"expression": self.OUTLINKS_JS},
            },
            "timeout": 60,
            "callback": set_outlinks,
        })

        self._chain_chrome_messages(chain)

    def _browse_interval_func(self):
        """Called periodically while page is being browsed. Returns True when
        finished browsing."""
        websock = self._websock
        if not (websock and websock.sock and websock.sock.connected):
            raise BrowsingException(
                    "websocket closed, did chrome die? {}".format(
                        self._websocket_url))
        if self._result_message_timeout:
            raise self._result_message_timeout
        if self._aw_snap_hes_dead_jim:
            raise BrowsingException(
                    """chrome tab went "aw snap" or "he's dead jim"!""")
        if self._outlinks is not None:
            # setting self._outlinks is the last thing that happens in the
            # post-behavior chain
            return True

        # start the post-behavior chain exactly once; without this guard it
        # was restarted on every poll after the hard timeout had passed
        if not self._postbehavior_chain_started:
            behavior_finished = (
                    self._behavior is not None
                    and self._behavior.is_finished())
            timed_out = (
                    time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS)
            if behavior_finished or timed_out:
                if behavior_finished:
                    self.logger.info(
                            "behavior decided it's finished with %s",
                            self.url)
                else:
                    self.logger.info(
                            "reached hard timeout of %s seconds url=%s",
                            Browser.HARD_TIMEOUT_SECONDS, self.url)
                self._behavior = None
                self._postbehavior_chain_started = True
                self._start_postbehavior_chain()
                return False

        if self._reached_limit:
            raise self._reached_limit
        if self._abort_browse_page:
            raise BrowsingAborted("browsing page aborted")
        return False

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        """
        Sends a message to chrome over the websocket.

        Args:
            suppress_logging: if True, the message is not logged
            **kwargs: the message itself, e.g. method=..., params=...; an
                "id" is added

        Returns:
            the id assigned to the message
        """
        msg_id = next(self.command_id)
        kwargs["id"] = msg_id
        msg = json.dumps(kwargs)
        if not suppress_logging:
            self.logger.debug("sending message to %s: %s", self._websock, msg)
        self._websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        """
        Websocket on_open handler: configures chrome and navigates to
        self.url.
        """
        # navigate to about:blank here to avoid situation where we navigate to
        # the same page that we're currently on, perhaps with a different
        # #fragment, which prevents Page.loadEventFired from happening
        self.send_to_chrome(
                method="Page.navigate", params={"url": "about:blank"})

        self.send_to_chrome(method="Network.enable")
        self.send_to_chrome(method="Page.enable")
        self.send_to_chrome(method="Console.enable")
        self.send_to_chrome(method="Debugger.enable")
        self.send_to_chrome(method="Runtime.enable")

        # copy, so the dict the caller passed to browse_page() is left alone
        headers = dict(self.extra_headers or {})
        headers['Accept-Encoding'] = 'identity'
        self.send_to_chrome(
                method="Network.setExtraHTTPHeaders",
                params={"headers":headers})

        if self.user_agent:
            self.send_to_chrome(
                    method="Network.setUserAgentOverride",
                    params={"userAgent": self.user_agent})

        # disable google analytics, see _handle_message() where breakpoint is
        # caught "Debugger.paused"
        self.send_to_chrome(
                method="Debugger.setBreakpointByUrl",
                params={
                    "lineNumber": 1,
                    "urlRegex":"https?://www.google-analytics.com/analytics.js"})

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _wrap_handle_message(self, websock, message):
        """
        Websocket on_message handler. Any exception from _handle_message is
        logged and turns into an abort of the page being browsed.
        """
        try:
            self._handle_message(websock, message)
        except BaseException:
            self.logger.error(
                    "uncaught exception in _handle_message message=%s",
                    message, exc_info=True)
            self.abort_browse_page()

    def _network_request_will_be_sent(self, message):
        """Handles Network.requestWillBeSent."""
        if self._behavior:
            self._behavior.notify_of_activity()
        request_url = message["params"]["request"]["url"]
        if request_url.lower().startswith("data:"):
            self.logger.debug("ignoring data url %s", request_url[:80])
        elif self.on_request:
            self.on_request(message)

    def _network_response_received(self, message):
        """
        Handles Network.responseReceived. The first 420 response carrying a
        Warcprox-Meta header is remembered as a brozzler.ReachedLimit, which
        browse_page() raises later.
        """
        if (not self._reached_limit
                and message["params"]["response"]["status"] == 420):
            headers = CaseInsensitiveDict(
                    message["params"]["response"]["headers"])
            if "Warcprox-Meta" in headers:
                warcprox_meta = json.loads(headers["Warcprox-Meta"])
                self._reached_limit = brozzler.ReachedLimit(
                        warcprox_meta=warcprox_meta)
                self.logger.info("reached limit %s", self._reached_limit)
        if self.on_response:
            self.on_response(message)

    def _page_load_event_fired(self, message):
        """
        Handles Page.loadEventFired: asks chrome for document.URL (to report
        a url change) and starts the behavior.
        """
        def page_url_after_load_event(message):
            current_url = message["result"]["result"]["value"]
            if current_url != self.url and self.on_url_change:
                self.on_url_change(current_url)

        msg_id = self.send_to_chrome(
                method="Runtime.evaluate",
                params={"expression":"document.URL"})
        self._waiting_on_result_messages[msg_id] = page_url_after_load_event

        self.logger.info(
                "Page.loadEventFired, moving on to starting behaviors url=%s",
                self.url)
        self._behavior = Behavior(self.url, self)
        self._behavior.start(self.behavior_parameters)

    def _console_message_added(self, message):
        """Handles Console.messageAdded by logging the console message."""
        self.logger.debug("%s console.%s %s", self._websock.url,
                message["params"]["message"]["level"],
                message["params"]["message"]["text"])

    def _debugger_paused(self, message):
        """Handles Debugger.paused."""
        # We hit the breakpoint set in visit_page. Get rid of google
        # analytics script!
        self.logger.debug("debugger paused! message=%s", message)
        script_id = message['params']['callFrames'][0]['location']['scriptId']

        # replace script
        self.send_to_chrome(
                method="Debugger.setScriptSource",
                params={
                    "scriptId": script_id,
                    "scriptSource":"console.log('google analytics is no more!');"})

        # resume execution
        self.send_to_chrome(method="Debugger.resume")

    def _handle_message(self, websock, json_message):
        """
        Decodes a message from chrome and dispatches it: events go to the
        matching handler (other events are ignored), results go to whichever
        callback or behavior is waiting on that message id.
        """
        message = json.loads(json_message)
        if "method" in message:
            method = message["method"]
            if method == "Network.requestWillBeSent":
                self._network_request_will_be_sent(message)
            elif method == "Network.responseReceived":
                self._network_response_received(message)
            elif method == "Page.loadEventFired":
                self._page_load_event_fired(message)
            elif method == "Console.messageAdded":
                self._console_message_added(message)
            elif method == "Debugger.paused":
                self._debugger_paused(message)
            elif method == "Inspector.targetCrashed":
                self._aw_snap_hes_dead_jim = message
        elif "result" in message:
            msg_id = message["id"]
            if msg_id in self._waiting_on_result_messages:
                callback = self._waiting_on_result_messages.pop(msg_id)
                self.logger.debug(
                        "received result for message id=%s, calling %s",
                        msg_id, callback)
                callback(message)
            elif self._behavior and self._behavior.is_waiting_on_result(
                    msg_id):
                self._behavior.notify_of_result(message)
Example #6
0
class Browser:
    '''
    Manages an instance of Chrome for browsing pages.

    Can be used as a context manager: entering calls start(), leaving calls
    stop().
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    def __init__(self, **kwargs):
        '''
        Initializes the Browser.

        Args:
            **kwargs: arguments for Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        self.websock_url = None
        self.websock = None
        self.websock_thread = None
        self.is_browsing = False
        # source of ids for messages sent to chrome
        self._command_id = Counter()
        # passed to brozzler.sleep() between polls in _wait_for()
        self._wait_interval = 0.5

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def _wait_for(self, callback, timeout=None):
        '''
        Spins until callback() returns truthy.

        Args:
            callback: called with no arguments before every sleep
            timeout: give up after this many seconds; a falsy value (None
                or 0) means wait forever (default None)

        Raises:
            BrowsingTimeout: if timeout elapses first
        '''
        start = time.time()
        while not callback():
            elapsed = time.time() - start
            if timeout and elapsed > timeout:
                raise BrowsingTimeout(
                        'timed out after %.1fs waiting for: %s' % (
                            elapsed, callback))
            brozzler.sleep(self._wait_interval)

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        '''
        Sends a message to chrome over the websocket.

        Args:
            suppress_logging: if True, the message is logged at
                logging.TRACE level instead of logging.DEBUG
            **kwargs: the message itself, e.g. method=..., params=...; an
                'id' is added

        Returns:
            the id assigned to the message
        '''
        msg_id = next(self._command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs, separators=(',', ':'))
        # log through the class logger like every other method here, rather
        # than through the root logger
        self.logger.log(
                logging.TRACE if suppress_logging else logging.DEBUG,
                'sending message to %s: %s', self.websock, msg)
        self.websock.send(msg)
        return msg_id

    def start(self, **kwargs):
        '''
        Starts chrome if it's not running.

        If anything fails after chrome has been launched, for example the
        websocket does not open within 30 seconds, the browser is stopped
        again before the exception propagates.

        Args:
            **kwargs: arguments for self.chrome.start(...)
        '''
        if self.is_running():
            return

        self.websock_url = self.chrome.start(**kwargs)
        try:
            self.websock = websocket.WebSocketApp(self.websock_url)
            self.websock_thread = WebsockReceiverThread(
                    self.websock, name='WebsockThread:%s' % self.chrome.port)
            self.websock_thread.start()

            self._wait_for(lambda: self.websock_thread.is_open, timeout=30)

            # tell browser to send us messages we're interested in
            self.send_to_chrome(method='Network.enable')
            self.send_to_chrome(method='Page.enable')
            self.send_to_chrome(method='Console.enable')
            self.send_to_chrome(method='Runtime.enable')

            # disable google analytics
            self.send_to_chrome(
                method='Network.setBlockedURLs',
                params={'urls': ['*google-analytics.com/analytics.js',
                                 '*google-analytics.com/ga.js']}
                )
        except BaseException:
            # don't leave chrome running, and is_running() reporting True,
            # behind a failed start; __exit__ is not called when __enter__
            # raises
            self.stop()
            raise

    def stop(self):
        '''
        Stops chrome if it's running.

        Closes the websocket if it is connected, stops chrome, then waits
        for the websocket thread to finish. Never raises; any problem is
        logged.
        '''
        try:
            if (self.websock and self.websock.sock
                    and self.websock.sock.connected):
                self.logger.info('shutting down websocket connection')
                try:
                    self.websock.close()
                except BaseException as e:
                    self.logger.error(
                            'exception closing websocket %s - %s',
                            self.websock, e)

            self.chrome.stop()

            # a thread cannot join itself
            if self.websock_thread and (
                    self.websock_thread != threading.current_thread()):
                self.websock_thread.join(timeout=30)
                if self.websock_thread.is_alive():
                    self.logger.error(
                            '%s still alive 30 seconds after closing %s, will '
                            'forcefully nudge it again', self.websock_thread,
                            self.websock)
                    self.websock.keep_running = False
                    self.websock_thread.join(timeout=30)
                    if self.websock_thread.is_alive():
                        self.logger.critical(
                                '%s still alive 60 seconds after closing %s',
                                self.websock_thread, self.websock)

            self.websock_url = None
        except BaseException:
            self.logger.error('problem stopping', exc_info=True)

    def is_running(self):
        '''Returns True if start() has obtained a websocket url.'''
        return self.websock_url is not None

    def browse_page(
            self, page_url, extra_headers=None,
            user_agent=None, behavior_parameters=None, behaviors_dir=None,
            on_request=None, on_response=None, on_screenshot=None,
            username=None, password=None, hashtags=None,
            skip_extract_outlinks=False, skip_visit_hashtags=False,
            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, runs behaviors, takes a
        screenshot, extracts outlinks.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the browser
                to send with every request; not modified (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: Directory containing behaviors.yaml and JS templates
                (default None loads Brozzler default JS behaviors)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the raw jpeg bytes (default None)
            username: passed to try_login() together with password
                (default None)
            password: if supplied, a login attempt is made after the page
                has loaded (default None)
            hashtags: extra hashtags for visit_hashtags() to navigate to
                (default None)
            skip_extract_outlinks: if True, outlinks are not extracted and
                an empty list is returned in their place (default False)
            skip_visit_hashtags: if True, visit_hashtags() is not called
                (default False)
            skip_youtube_dl: accepted, but not used by this method
                (default False)
            page_timeout: timeout in seconds for each page load, including
                the one after a login attempt (default 300)
            behavior_timeout: timeout in seconds for run_behavior()
                (default 900)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the original
                page url if the page redirects, javascript has changed the url
                in the location bar, etc
            outlinks: the navigational links extracted from the page (a
                frozenset of strings, or an empty list if
                skip_extract_outlinks is set)

        Raises:
            brozzler.ProxyError: in case of proxy connection error
            brozzler.ReachedLimit: the one stashed by the websocket thread
            BrowsingException: if browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response
        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(
                        extra_headers=extra_headers,
                        user_agent=user_agent)
                self.navigate_to_page(page_url, timeout=page_timeout)
                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    if page_url != self.url().split('#')[0]:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)
                if on_screenshot:
                    self._try_screenshot(on_screenshot)
                behavior_script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                self.run_behavior(behavior_script, timeout=behavior_timeout)
                if skip_extract_outlinks:
                    outlinks = []
                else:
                    outlinks = self.extract_outlinks()
                if not skip_visit_hashtags:
                    self.visit_hashtags(self.url(), hashtags, outlinks)
                final_page_url = self.url()
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(e) from e
        finally:
            self.is_browsing = False
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None

    def _try_screenshot(self, on_screenshot):
        '''
        Takes a screenshot and hands it to on_screenshot, making up to
        three attempts if screenshot() times out. Gives up silently (apart
        from the logged errors) after the third timeout.
        '''
        for attempt in range(1, 4):
            try:
                jpeg_bytes = self.screenshot()
                on_screenshot(jpeg_bytes)
                return
            except BrowsingTimeout as e:
                self.logger.error('attempt %s/3: %s', attempt, e)

    def visit_hashtags(self, page_url, hashtags, outlinks):
        '''
        Navigates to page_url once for every hashtag, pausing five seconds
        after each navigation.

        Args:
            page_url: url of the page, expected to have no fragment
            hashtags: iterable of hashtags (including the leading '#') to
                visit, or None
            outlinks: outlinks of the page; fragments of outlinks that
                point back at page_url are visited too
        '''
        _hashtags = set(hashtags or [])
        for outlink in outlinks:
            url = urlcanon.whatwg(outlink)
            hashtag = (url.hash_sign + url.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url)
            if hashtag and str(url) == page_url:
                _hashtags.add(hashtag)
        # could inject a script that listens for HashChangeEvent to figure
        # out which hashtags were visited already and skip those
        for hashtag in _hashtags:
            # navigate_to_hashtag (nothing to wait for so no timeout?)
            self.logger.debug('navigating to hashtag %s', hashtag)
            url = urlcanon.whatwg(page_url)
            url.hash_sign = b'#'
            url.fragment = hashtag[1:].encode('utf-8')
            self.send_to_chrome(
                    method='Page.navigate', params={'url': str(url)})
            time.sleep(5) # um.. wait for idleness or something?
            # take another screenshot?
            # run behavior again with short timeout?
            # retrieve outlinks again and append to list?

    def configure_browser(self, extra_headers=None, user_agent=None):
        '''
        Tells chrome which extra http headers to send and, optionally, which
        user agent to use. Waits up to 10 seconds for chrome to acknowledge
        the headers; the user agent override is not waited for.

        Args:
            extra_headers: dict of extra http headers; not modified
                (default None)
            user_agent: user agent string (default None)
        '''
        # copy, so that the caller's dict is left alone
        headers = dict(extra_headers or {})
        headers['Accept-Encoding'] = 'gzip'  # avoid encodings br, sdch
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
                method='Network.setExtraHTTPHeaders',
                params={'headers': headers})
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=10)
        if user_agent:
            self.send_to_chrome(
                    method='Network.setUserAgentOverride',
                    params={'userAgent': user_agent})

    def navigate_to_page(self, page_url, timeout=300):
        '''
        Navigates to page_url and waits for the page load event.

        Raises:
            BrowsingTimeout: if no page load event arrives within timeout
                seconds
        '''
        self.logger.info('navigating to page %s', page_url)
        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(method='Page.navigate', params={'url': page_url})
        self._wait_for(
                lambda: self.websock_thread.got_page_load_event,
                timeout=timeout)

    def extract_outlinks(self, timeout=60):
        '''
        Evaluates the extract-outlinks.js template in the page.

        Returns:
            frozenset of outlinks (strings); empty if no links were found
            or the result message does not have the expected shape

        Raises:
            BrowsingTimeout: if chrome does not answer within timeout
                seconds
        '''
        self.logger.info('extracting outlinks')
        self.websock_thread.expect_result(self._command_id.peek())
        js = brozzler.jinja2_environment().get_template(
                'extract-outlinks.js').render()
        msg_id = self.send_to_chrome(
                method='Runtime.evaluate', params={'expression': js})
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        if ('result' in message and 'result' in message['result']
                and 'value' in message['result']['result']):
            value = message['result']['result']['value']
            if value:
                return frozenset(value.split('\n'))
            # no links found
            return frozenset()
        self.logger.error(
                'problem extracting outlinks, result message: %s', message)
        return frozenset()

    def screenshot(self, timeout=45):
        '''
        Captures a screenshot of the page.

        Returns:
            the decoded image bytes

        Raises:
            BrowsingTimeout: if chrome does not answer within timeout
                seconds
        '''
        self.logger.info('taking screenshot')
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(method='Page.captureScreenshot')
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        return base64.b64decode(message['result']['data'])

    def url(self, timeout=30):
        '''
        Returns value of document.URL from the browser.

        Raises:
            BrowsingTimeout: if chrome does not answer within timeout
                seconds
        '''
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
                method='Runtime.evaluate',
                params={'expression': 'document.URL'})
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        return message['result']['result']['value']

    def run_behavior(self, behavior_script, timeout=900):
        '''
        Runs behavior_script in the page, then polls the page's
        umbraBehaviorFinished() until it returns true or timeout seconds
        have passed.

        A poll that times out, throws, or returns anything other than the
        boolean true just means "not finished yet".
        '''
        self.send_to_chrome(
                method='Runtime.evaluate', suppress_logging=True,
                params={'expression': behavior_script})

        start = time.time()
        while True:
            elapsed = time.time() - start
            if elapsed > timeout:
                self.logger.info(
                        'behavior reached hard timeout after %.1fs', elapsed)
                return

            brozzler.sleep(7)

            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                     method='Runtime.evaluate', suppress_logging=True,
                     params={'expression': 'umbraBehaviorFinished()'})
            try:
                self._wait_for(
                        lambda: self.websock_thread.received_result(msg_id),
                        timeout=5)
                msg = self.websock_thread.pop_result(msg_id)
                # .get() because an evaluation that yields undefined has no
                # 'value' in its result
                if (msg and 'result' in msg
                        and 'exceptionDetails' not in msg['result']
                        and not msg['result'].get('wasThrown')
                        and 'result' in msg['result']
                        and msg['result']['result'].get('value') is True):
                    self.logger.info('behavior decided it has finished')
                    return
            except BrowsingTimeout:
                pass

    def try_login(self, username, password, timeout=300):
        '''
        Runs the try-login.js.j2 template in the page. If it reports that a
        login form was (maybe) submitted, waits for the page load event that
        should follow.

        Raises:
            BrowsingException: if the login script's state cannot be
                determined within 30 seconds
            BrowsingTimeout: if a form was submitted but no page load event
                arrives within timeout seconds
        '''
        try_login_js = brozzler.jinja2_environment().get_template(
                'try-login.js.j2').render(username=username, password=password)

        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(
                method='Runtime.evaluate', suppress_logging=True,
                params={'expression': try_login_js})

        # wait for tryLogin to finish trying (should be very very quick)
        start = time.time()
        while True:
            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                method='Runtime.evaluate',
                params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'})
            try:
                self._wait_for(
                        lambda: self.websock_thread.received_result(msg_id),
                        timeout=5)
                msg = self.websock_thread.pop_result(msg_id)
                if (msg and 'result' in msg
                        and 'result' in msg['result']):
                    # .get() because an undefined state has no 'value'
                    result = msg['result']['result'].get('value')
                    if result == 'login-form-not-found':
                        # we're done
                        return
                    elif result in ('submitted-form', 'maybe-submitted-form'):
                        # wait for page load event below
                        self.logger.info(
                                'submitted a login form, waiting for another '
                                'page load event')
                        break
                    # else try again to get __brzl_tryLoginState

            except BrowsingTimeout:
                pass

            if time.time() - start > 30:
                raise BrowsingException(
                        'timed out trying to check if tryLogin finished')

        # if we get here, we submitted a form, now we wait for another page
        # load event
        self._wait_for(
                lambda: self.websock_thread.got_page_load_event,
                timeout=timeout)
Example #7
0
class Browser:
    '''
    Manages an instance of Chrome for browsing pages.
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    def __init__(self, **kwargs):
        '''
        Sets up a Browser in the not-yet-started state.

        Chrome(**kwargs) is instantiated here; the websocket attributes stay
        None until start() fills them in.

        Args:
            **kwargs: passed straight through to Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        # populated by start(); is_running() keys off websock_url
        self.websock_url = self.websock = self.websock_thread = None
        # True only while browse_page() is in progress
        self.is_browsing = False
        # supplies the id of every message sent by send_to_chrome()
        self._command_id = Counter()
        # value handed to brozzler.sleep() between polls in _wait_for()
        self._wait_interval = 0.5

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def _wait_for(self, callback, timeout=None):
        '''
        Spins until callback() returns truthy.
        '''
        start = time.time()
        while True:
            if callback():
                return
            elapsed = time.time() - start
            if timeout and elapsed > timeout:
                raise BrowsingTimeout(
                        'timed out after %.1fs waiting for: %s' % (
                            elapsed, callback))
            brozzler.sleep(self._wait_interval)

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        msg_id = next(self._command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs, separators=',:')
        logging.log(
                logging.TRACE if suppress_logging else logging.DEBUG,
                'sending message to %s: %s', self.websock, msg)
        self.websock.send(msg)
        return msg_id

    def start(self, **kwargs):
        '''
        Starts chrome if it's not running.

        Args:
            **kwargs: arguments for self.chrome.start(...)
        '''
        if not self.is_running():
            self.websock_url = self.chrome.start(**kwargs)
            self.websock = websocket.WebSocketApp(self.websock_url)
            self.websock_thread = WebsockReceiverThread(
                    self.websock, name='WebsockThread:%s' % self.chrome.port)
            self.websock_thread.start()

            self._wait_for(lambda: self.websock_thread.is_open, timeout=30)

            # tell browser to send us messages we're interested in
            self.send_to_chrome(method='Network.enable')
            self.send_to_chrome(method='Page.enable')
            self.send_to_chrome(method='Console.enable')
            self.send_to_chrome(method='Runtime.enable')
            self.send_to_chrome(method='ServiceWorker.enable')
            self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad')

            # disable google analytics
            self.send_to_chrome(
                method='Network.setBlockedURLs',
                params={'urls': ['*google-analytics.com/analytics.js',
                                 '*google-analytics.com/ga.js']})

    def stop(self):
        '''
        Stops chrome if it's running.
        '''
        try:
            if (self.websock and self.websock.sock
                    and self.websock.sock.connected):
                self.logger.info('shutting down websocket connection')
                try:
                    self.websock.close()
                except BaseException as e:
                    self.logger.error(
                            'exception closing websocket %s - %s',
                            self.websock, e)

            self.chrome.stop()

            if self.websock_thread and (
                    self.websock_thread != threading.current_thread()):
                self.websock_thread.join(timeout=30)
                if self.websock_thread.is_alive():
                    self.logger.error(
                            '%s still alive 30 seconds after closing %s, will '
                            'forcefully nudge it again', self.websock_thread,
                            self.websock)
                    self.websock.keep_running = False
                    self.websock_thread.join(timeout=30)
                    if self.websock_thread.is_alive():
                        self.logger.critical(
                                '%s still alive 60 seconds after closing %s',
                                    self.websock_thread, self.websock)

            self.websock_url = None
        except:
            self.logger.error('problem stopping', exc_info=True)

    def is_running(self):
        return self.websock_url is not None

    def browse_page(
            self, page_url, extra_headers=None,
            user_agent=None, behavior_parameters=None, behaviors_dir=None,
            on_request=None, on_response=None,
            on_service_worker_version_updated=None, on_screenshot=None,
            username=None, password=None, hashtags=None,
            skip_extract_outlinks=False, skip_visit_hashtags=False,
            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, runs behaviors, takes a
        screenshot, extracts outlinks.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the browser
                to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: Directory containing behaviors.yaml and JS templates
                (default None loads Brozzler default JS behaviors)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_service_worker_version_updated: callback to invoke on every
                ServiceWorker.workerVersionUpdated event, takes one argument,
                the json-decoded message (default None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the the raw jpeg bytes (default None)
                # XXX takes two arguments, the url of the page at the time the
                # screenshot was taken, and the raw jpeg bytes (default None)
            username: username string to use to try logging in if a login form
                is found in the page (default None)
            password: password string to use to try logging in if a login form
                is found in the page (default None)
            ... (there are more)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the original
                page url if the page redirects, javascript has changed the url
                in the location bar, etc
            outlinks: a list of navigational links extracted from the page

        Raises:
            brozzler.ProxyError: in case of proxy connection error
            BrowsingException: if browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response
        if on_service_worker_version_updated:
            self.websock_thread.on_service_worker_version_updated = \
                    on_service_worker_version_updated
        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(
                        extra_headers=extra_headers,
                        user_agent=user_agent)
                self.navigate_to_page(page_url, timeout=page_timeout)
                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    if page_url != self.url().split('#')[0]:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)
                if on_screenshot:
                    self._try_screenshot(on_screenshot)
                behavior_script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                self.run_behavior(behavior_script, timeout=behavior_timeout)
                if skip_extract_outlinks:
                    outlinks = []
                else:
                    outlinks = self.extract_outlinks()
                if not skip_visit_hashtags:
                    self.visit_hashtags(self.url(), hashtags, outlinks)
                final_page_url = self.url()
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(e)
        finally:
            self.is_browsing = False
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None

    def _try_screenshot(self, on_screenshot):
        for i in range(3):
            try:
                jpeg_bytes = self.screenshot()
                on_screenshot(jpeg_bytes)
                return
            except BrowsingTimeout as e:
                logging.error('attempt %s/3: %s', i+1, e)

    def visit_hashtags(self, page_url, hashtags, outlinks):
        _hashtags = set(hashtags or [])
        for outlink in outlinks:
            url = urlcanon.whatwg(outlink)
            hashtag = (url.hash_sign + url.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url)
            if hashtag and str(url) == page_url:
                _hashtags.add(hashtag)
        # could inject a script that listens for HashChangeEvent to figure
        # out which hashtags were visited already and skip those
        for hashtag in _hashtags:
            # navigate_to_hashtag (nothing to wait for so no timeout?)
            self.logger.debug('navigating to hashtag %s', hashtag)
            url = urlcanon.whatwg(page_url)
            url.hash_sign = b'#'
            url.fragment = hashtag[1:].encode('utf-8')
            self.send_to_chrome(
                    method='Page.navigate', params={'url': str(url)})
            time.sleep(5) # um.. wait for idleness or something?
            # take another screenshot?
            # run behavior again with short timeout?
            # retrieve outlinks again and append to list?

    def configure_browser(self, extra_headers=None, user_agent=None):
        headers = extra_headers or {}
        headers['Accept-Encoding'] = 'gzip'  # avoid encodings br, sdch
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
                method='Network.setExtraHTTPHeaders',
                params={'headers': headers})
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=10)
        if user_agent:
            msg_id = self.send_to_chrome(
                    method='Network.setUserAgentOverride',
                    params={'userAgent': user_agent})

    def navigate_to_page(self, page_url, timeout=300):
        self.logger.info('navigating to page %s', page_url)
        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(method='Page.navigate', params={'url': page_url})
        self._wait_for(
                lambda: self.websock_thread.got_page_load_event,
                timeout=timeout)

    def extract_outlinks(self, timeout=60):
        '''
        Evaluates the 'extract-outlinks.js' template in the page and returns
        the links it reports.

        The script's string result is split on newlines.

        Args:
            timeout: seconds to wait for the evaluation result (default 60)

        Returns:
            frozenset of outlink strings; empty if the script returned a
            falsy value or the result message has an unexpected shape (the
            latter is logged as an error)

        Raises:
            BrowsingTimeout: if no result arrives within `timeout`
        '''
        self.logger.info('extracting outlinks')
        self.websock_thread.expect_result(self._command_id.peek())
        template = brozzler.jinja2_environment().get_template(
                'extract-outlinks.js')
        js = template.render()
        msg_id = self.send_to_chrome(
                method='Runtime.evaluate', params={'expression': js})
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)

        has_value = (
                'result' in message and 'result' in message['result']
                and 'value' in message['result']['result'])
        if not has_value:
            self.logger.error(
                    'problem extracting outlinks, result message: %s', message)
            return frozenset()

        joined_links = message['result']['result']['value']
        if not joined_links:
            # no links found
            return frozenset()
        return frozenset(joined_links.split('\n'))

    def screenshot(self, timeout=45):
        self.logger.info('taking screenshot')
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(method='Page.captureScreenshot')
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        jpeg_bytes = base64.b64decode(message['result']['data'])
        return jpeg_bytes

    def url(self, timeout=30):
        '''
        Returns value of document.URL from the browser.
        '''
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
                method='Runtime.evaluate',
                params={'expression': 'document.URL'})
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        return message['result']['result']['value']

    def run_behavior(self, behavior_script, timeout=900):
        self.send_to_chrome(
                method='Runtime.evaluate', suppress_logging=True,
                params={'expression': behavior_script})

        start = time.time()
        while True:
            elapsed = time.time() - start
            if elapsed > timeout:
                logging.info(
                        'behavior reached hard timeout after %.1fs', elapsed)
                return

            brozzler.sleep(7)

            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                     method='Runtime.evaluate', suppress_logging=True,
                     params={'expression': 'umbraBehaviorFinished()'})
            try:
                self._wait_for(
                        lambda: self.websock_thread.received_result(msg_id),
                        timeout=5)
                msg = self.websock_thread.pop_result(msg_id)
                if (msg and 'result' in msg
                        and not ('exceptionDetails' in msg['result'])
                        and not ('wasThrown' in msg['result']
                            and msg['result']['wasThrown'])
                        and 'result' in msg['result']
                        and type(msg['result']['result']['value']) == bool
                        and msg['result']['result']['value']):
                    self.logger.info('behavior decided it has finished')
                    return
            except BrowsingTimeout:
                pass

    def try_login(self, username, password, timeout=300):
        '''
        Tries to log in through a login form on the current page.

        Evaluates the rendered 'try-login.js.j2' template in the page, then
        polls the page's __brzl_tryLoginState variable (each poll waits up to
        5 seconds for an answer) until it reports an outcome:
        'login-form-not-found' returns immediately, 'submitted-form' or
        'maybe-submitted-form' makes this method wait for the next page load
        event. Any other answer, including a result without a value, is
        polled again.

        Args:
            username: username rendered into the login script
            password: password rendered into the login script
            timeout: seconds to wait for the page load event that follows a
                form submission (default 300)

        Raises:
            BrowsingException: if no outcome is reported within 30 seconds
            BrowsingTimeout: if no page load event follows the form
                submission within `timeout`
        '''
        try_login_js = brozzler.jinja2_environment().get_template(
                'try-login.js.j2').render(username=username, password=password)

        # reset before injecting the script so that a load event triggered
        # by the form submission is not missed
        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(
                method='Runtime.evaluate', suppress_logging=True,
                params={'expression': try_login_js})

        # wait for tryLogin to finish trying (should be very very quick)
        start = time.time()
        while True:
            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                method='Runtime.evaluate',
                params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'})
            try:
                self._wait_for(
                        lambda: self.websock_thread.received_result(msg_id),
                        timeout=5)
                msg = self.websock_thread.pop_result(msg_id)
                if (msg and 'result' in msg
                        and 'result' in msg['result']):
                    # 'value' is absent when the state evaluates to
                    # undefined; treat that as "no outcome yet" instead of
                    # failing with KeyError
                    result = msg['result']['result'].get('value')
                    if result == 'login-form-not-found':
                        # we're done
                        return
                    elif result in ('submitted-form', 'maybe-submitted-form'):
                        # wait for page load event below
                        self.logger.info(
                                'submitted a login form, waiting for another '
                                'page load event')
                        break
                    # else try again to get __brzl_tryLoginState

            except BrowsingTimeout:
                pass

            if time.time() - start > 30:
                raise BrowsingException(
                        'timed out trying to check if tryLogin finished')

        # if we get here, we submitted a form, now we wait for another page
        # load event
        self._wait_for(
                lambda: self.websock_thread.got_page_load_event,
                timeout=timeout)
Example #8
0
class Browser:
    '''
    Manages an instance of Chrome for browsing pages.
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    def __init__(self, **kwargs):
        '''
        Builds a Browser that has not been started yet.

        Chrome(**kwargs) is instantiated here; the websocket attributes are
        left as None for start() to populate.

        Args:
            **kwargs: forwarded unchanged to Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        # set by start(); websock_url is what is_running() inspects
        self.websock_url = self.websock = self.websock_thread = None
        # set for the duration of a browse_page() call
        self.is_browsing = False
        # yields the id used for each message in send_to_chrome()
        self._command_id = Counter()
        # value passed to brozzler.sleep() between polls in _wait_for()
        self._wait_interval = 0.5

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def _wait_for(self, callback, timeout=None):
        '''
        Spins until callback() returns truthy.
        '''
        start = time.time()
        while True:
            if callback():
                return
            elapsed = time.time() - start
            if timeout and elapsed > timeout:
                raise BrowsingTimeout('timed out after %.1fs waiting for: %s' %
                                      (elapsed, callback))
            brozzler.sleep(self._wait_interval)

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        msg_id = next(self._command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs, separators=',:')
        logging.log(logging.TRACE if suppress_logging else logging.DEBUG,
                    'sending message to %s: %s', self.websock, msg)
        self.websock.send(msg)
        return msg_id

    def start(self, **kwargs):
        '''
        Starts chrome if it's not running.

        Args:
            **kwargs: arguments for self.chrome.start(...)
        '''
        if not self.is_running():
            self.websock_url = self.chrome.start(**kwargs)
            self.websock = websocket.WebSocketApp(self.websock_url)
            self.websock_thread = WebsockReceiverThread(
                self.websock, name='WebsockThread:%s' % self.chrome.port)
            self.websock_thread.start()

            self._wait_for(lambda: self.websock_thread.is_open, timeout=30)

            # tell browser to send us messages we're interested in
            self.send_to_chrome(method='Network.enable')
            self.send_to_chrome(method='Page.enable')
            # Enable Console & Runtime output only when debugging.
            # After all, we just print these events with debug(), we don't use
            # them in Brozzler logic.
            if self.logger.isEnabledFor(logging.DEBUG):
                self.send_to_chrome(method='Console.enable')
                self.send_to_chrome(method='Runtime.enable')
            self.send_to_chrome(method='ServiceWorker.enable')
            self.send_to_chrome(
                method='ServiceWorker.setForceUpdateOnPageLoad')

            # disable google analytics and amp analytics
            self.send_to_chrome(
                method='Network.setBlockedURLs',
                params={
                    'urls': [
                        '*google-analytics.com/analytics.js*',
                        '*google-analytics.com/ga.js*',
                        '*google-analytics.com/ga_exp.js*',
                        '*google-analytics.com/urchin.js*',
                        '*google-analytics.com/collect*',
                        '*google-analytics.com/r/collect*',
                        '*google-analytics.com/__utm.gif*',
                        '*google-analytics.com/gtm/js?*',
                        '*google-analytics.com/cx/api.js*',
                        '*cdn.ampproject.org/*/amp-analytics*.js'
                    ]
                })

    def stop(self):
        '''
        Stops chrome if it's running.
        '''
        try:
            if (self.websock and self.websock.sock
                    and self.websock.sock.connected):
                self.logger.info('shutting down websocket connection')
                try:
                    self.websock.close()
                except BaseException as e:
                    self.logger.error('exception closing websocket %s - %s',
                                      self.websock, e)

            self.chrome.stop()

            if self.websock_thread and (self.websock_thread !=
                                        threading.current_thread()):
                self.websock_thread.join(timeout=30)
                if self.websock_thread.is_alive():
                    self.logger.error(
                        '%s still alive 30 seconds after closing %s, will '
                        'forcefully nudge it again', self.websock_thread,
                        self.websock)
                    self.websock.keep_running = False
                    self.websock_thread.join(timeout=30)
                    if self.websock_thread.is_alive():
                        self.logger.critical(
                            '%s still alive 60 seconds after closing %s',
                            self.websock_thread, self.websock)

            self.websock_url = None
        except:
            self.logger.error('problem stopping', exc_info=True)

    def is_running(self):
        return self.websock_url is not None

    def browse_page(self,
                    page_url,
                    extra_headers=None,
                    user_agent=None,
                    behavior_parameters=None,
                    behaviors_dir=None,
                    on_request=None,
                    on_response=None,
                    on_service_worker_version_updated=None,
                    on_screenshot=None,
                    username=None,
                    password=None,
                    hashtags=None,
                    screenshot_full_page=False,
                    skip_extract_outlinks=False,
                    skip_visit_hashtags=False,
                    skip_youtube_dl=False,
                    simpler404=False,
                    page_timeout=300,
                    behavior_timeout=900,
                    extract_outlinks_timeout=60,
                    download_throughput=-1,
                    stealth=False):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, runs behaviors, takes a
        screenshot, extracts outlinks.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the browser
                to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: Directory containing behaviors.yaml and JS templates
                (default None loads Brozzler default JS behaviors)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_service_worker_version_updated: callback to invoke on every
                ServiceWorker.workerVersionUpdated event, takes one argument,
                the json-decoded message (default None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the the raw jpeg bytes (default None)
                # XXX takes two arguments, the url of the page at the time the
                # screenshot was taken, and the raw jpeg bytes (default None)
            username: username string to use to try logging in if a login form
                is found in the page (default None)
            password: password string to use to try logging in if a login form
                is found in the page (default None)
            ... (there are more)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the original
                page url if the page redirects, javascript has changed the url
                in the location bar, etc
            outlinks: a list of navigational links extracted from the page

        Raises:
            brozzler.ProxyError: in case of proxy connection error
            BrowsingException: if browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response
        if on_service_worker_version_updated:
            self.websock_thread.on_service_worker_version_updated = \
                    on_service_worker_version_updated
        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(extra_headers=extra_headers,
                                       user_agent=user_agent,
                                       download_throughput=download_throughput,
                                       stealth=stealth)
                self.navigate_to_page(page_url, timeout=page_timeout)
                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    if page_url != self.url().split('#')[0]:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)
                # If the target page HTTP status is 4xx/5xx, there is no point
                # in running behaviors, outlink and hashtag extraction as we
                # didn't get a valid page. Screenshot should run because i
                # may be useful to have a picture of the error page.
                # This is only enabled with option `simpler404`.
                run_behaviors = True
                if simpler404 and (self.websock_thread.page_status is None
                                   or self.websock_thread.page_status >= 400):
                    run_behaviors = False

                if run_behaviors and behavior_timeout > 0:
                    behavior_script = brozzler.behavior_script(
                        page_url,
                        behavior_parameters,
                        behaviors_dir=behaviors_dir)
                    self.run_behavior(behavior_script,
                                      timeout=behavior_timeout)
                final_page_url = self.url()
                if on_screenshot:
                    self._try_screenshot(on_screenshot, screenshot_full_page)
                if not run_behaviors or skip_extract_outlinks:
                    outlinks = []
                else:
                    outlinks = self.extract_outlinks(
                        timeout=extract_outlinks_timeout)
                if run_behaviors and not skip_visit_hashtags:
                    self.visit_hashtags(final_page_url, hashtags, outlinks)
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(e)
        finally:
            self.is_browsing = False
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None

    def _try_screenshot(self, on_screenshot, full_page=False):
        """The browser instance must be scrolled to the top of the page before
        trying to get a screenshot.
        """
        self.send_to_chrome(method='Runtime.evaluate',
                            suppress_logging=True,
                            params={'expression': 'window.scroll(0,0)'})
        for i in range(3):
            try:
                jpeg_bytes = self.screenshot(full_page)
                on_screenshot(jpeg_bytes)
                return
            except BrowsingTimeout as e:
                logging.error('attempt %s/3: %s', i + 1, e)

    def visit_hashtags(self, page_url, hashtags, outlinks):
        """Navigate the browser to each fragment ("hashtag") of the page.

        The fragments to visit are those given in ``hashtags`` plus the
        fragment of every outlink which, once its fragment is removed, is
        equal to ``page_url``. For each fragment a ``Page.navigate`` is sent
        for ``page_url`` with that fragment, followed by a fixed 5 second
        sleep.

        Args:
            page_url: url of the page currently loaded
            hashtags: iterable of fragments including the leading '#', or
                None
            outlinks: iterable of outlink urls to scan for more fragments
        """
        pending = set(hashtags) if hashtags else set()
        for link in outlinks:
            parsed = urlcanon.whatwg(link)
            fragment = (parsed.hash_sign + parsed.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(parsed)
            if not fragment or str(parsed) != page_url:
                continue
            pending.add(fragment)

        # idea: inject a script listening for HashChangeEvent to learn which
        # hashtags were already visited, and skip those
        for fragment in pending:
            # nothing to wait for after this navigation, so no timeout (?)
            self.logger.debug('navigating to hashtag %s', fragment)
            target = urlcanon.whatwg(page_url)
            target.hash_sign = b'#'
            target.fragment = fragment[1:].encode('utf-8')
            self.send_to_chrome(
                method='Page.navigate', params=dict(url=str(target)))
            # crude fixed pause; waiting for idleness would be better
            time.sleep(5)
            # open questions: take another screenshot? run the behavior
            # again with a short timeout? extract outlinks again and append?

    def configure_browser(self,
                          extra_headers=None,
                          user_agent=None,
                          download_throughput=-1,
                          stealth=False):
        """Apply network and script settings to the browser.

        ``Network.setExtraHTTPHeaders`` is always sent, with
        ``Accept-Encoding`` forced to ``gzip``, and its result is awaited for
        up to 10 seconds. The user agent override and the network throttling
        commands are sent without waiting for their results.

        Args:
            extra_headers: optional dict of extra http headers; it is copied,
                so the caller's dict is never modified
            user_agent: if truthy, sent with ``Network.setUserAgentOverride``
            download_throughput: bytes/second, sent with
                ``Network.emulateNetworkConditions`` when greater than -1;
                -1 disables it (default)
            stealth: if truthy, the rendered ``stealth.js`` template is
                registered with ``Page.addScriptToEvaluateOnNewDocument`` and
                the result is awaited for up to 10 seconds
        """
        # copy: adding Accept-Encoding must not leak into the caller's dict
        headers = dict(extra_headers or {})
        headers['Accept-Encoding'] = 'gzip'  # avoid encodings br, sdch
        self.websock_thread.expect_result(self._command_id.peek())
        headers_msg_id = self.send_to_chrome(
            method='Network.setExtraHTTPHeaders',
            params={'headers': headers})
        self._wait_for(
            lambda: self.websock_thread.received_result(headers_msg_id),
            timeout=10)
        if user_agent:
            self.send_to_chrome(method='Network.setUserAgentOverride',
                                params={'userAgent': user_agent})
        if download_throughput > -1:
            # traffic shaping already used by SPN2 to aid warcprox resilience
            # parameter value as bytes/second, or -1 to disable (default)
            self.send_to_chrome(
                method='Network.emulateNetworkConditions',
                params={'downloadThroughput': download_throughput})
        if stealth:
            self.websock_thread.expect_result(self._command_id.peek())
            js = brozzler.jinja2_environment().get_template(
                'stealth.js').render()
            stealth_msg_id = self.send_to_chrome(
                method='Page.addScriptToEvaluateOnNewDocument',
                params={'source': js})
            self._wait_for(
                lambda: self.websock_thread.received_result(stealth_msg_id),
                timeout=10)

    def navigate_to_page(self, page_url, timeout=300):
        """Send ``Page.navigate`` for page_url and wait for the load event.

        The websocket thread's ``got_page_load_event`` and ``page_status``
        are cleared before navigating; the method then waits until
        ``got_page_load_event`` becomes truthy.

        Args:
            page_url: url to open
            timeout: passed to self._wait_for() (default 300)
        """
        def page_loaded():
            return self.websock_thread.got_page_load_event

        self.logger.info('navigating to page %s', page_url)
        thread = self.websock_thread
        thread.got_page_load_event = None
        thread.page_status = None
        self.send_to_chrome(method='Page.navigate',
                            params=dict(url=page_url))
        self._wait_for(page_loaded, timeout=timeout)

    def extract_outlinks(self, timeout=60):
        """Run the ``extract-outlinks.js`` template in the page and return
        the links it reports.

        The script result is expected to be a newline-separated string of
        urls. Each one is parsed with urlcanon.whatwg() and stored as a str;
        a link that raises AddressValueError is logged and skipped.

        Args:
            timeout: passed to self._wait_for() while waiting for the result
                (default 60)

        Returns:
            frozenset of url strings; empty if no links were found or if the
            result message does not have the expected structure (the latter
            is logged as an error)
        """
        self.logger.info('extracting outlinks')
        self.websock_thread.expect_result(self._command_id.peek())
        template = brozzler.jinja2_environment().get_template(
            'extract-outlinks.js')
        script = template.render()
        msg_id = self.send_to_chrome(method='Runtime.evaluate',
                                     params=dict(expression=script))
        self._wait_for(lambda: self.websock_thread.received_result(msg_id),
                       timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)

        has_value = ('result' in message
                     and 'result' in message['result']
                     and 'value' in message['result']['result'])
        if not has_value:
            self.logger.error(
                'problem extracting outlinks, result message: %s', message)
            return frozenset()

        joined = message['result']['result']['value']
        if not joined:
            # no links found
            return frozenset()

        found = set()
        for raw_link in joined.split('\n'):
            try:
                found.add(str(urlcanon.whatwg(raw_link)))
            except AddressValueError:
                self.logger.warning('skip invalid outlink: %s', raw_link)
        return frozenset(found)

    def screenshot(self, full_page=False, timeout=45):
        """Capture a jpeg screenshot (quality 95) and return its bytes.

        With ``full_page`` the content size is first read with
        ``Page.getLayoutMetrics``, the device metrics are overridden to that
        size and the capture is clipped to it; the approach is inspired by
        puppeteer:
        https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898

        Args:
            full_page: capture the whole content area instead of the current
                viewport (default False)
            timeout: passed to self._wait_for() for each awaited command
                (default 45)

        Returns:
            the decoded jpeg bytes
        """
        def request(**command):
            # send one command and block until its result message arrives
            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(**command)
            self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
            return self.websock_thread.pop_result(msg_id)

        self.logger.info('taking screenshot')
        capture_params = {'format': 'jpeg', 'quality': 95}
        if full_page:
            content_size = request(
                method='Page.getLayoutMetrics')['result']['contentSize']
            width = content_size['width']
            height = content_size['height']
            self.send_to_chrome(
                method='Emulation.setDeviceMetricsOverride',
                params={
                    'mobile': False,
                    'width': width,
                    'height': height,
                    'deviceScaleFactor': 1,
                    'screenOrientation': {
                        'angle': 0, 'type': 'portraitPrimary'},
                })
            capture_params['clip'] = {
                'x': 0, 'y': 0, 'width': width, 'height': height, 'scale': 1}
        reply = request(method='Page.captureScreenshot',
                        params=capture_params)
        return base64.b64decode(reply['result']['data'])

    def url(self, timeout=30):
        '''
        Evaluates ``document.URL`` in the browser and returns its value.

        Args:
            timeout: passed to self._wait_for() while waiting for the result
                (default 30)
        '''
        self.websock_thread.expect_result(self._command_id.peek())
        command = dict(method='Runtime.evaluate',
                       params={'expression': 'document.URL'})
        msg_id = self.send_to_chrome(**command)

        def got_reply():
            return self.websock_thread.received_result(msg_id)

        self._wait_for(got_reply, timeout=timeout)
        reply = self.websock_thread.pop_result(msg_id)
        return reply['result']['result']['value']

    def run_behavior(self, behavior_script, timeout=900):
        """Start a behavior script in the page and wait for it to finish.

        The script is evaluated once, then ``umbraBehaviorFinished()`` is
        polled every ``min(timeout, 7)`` seconds. The method returns when
        that call evaluates to the boolean ``true``, or when more than
        ``timeout`` seconds have elapsed since the script was started.

        A poll that times out, throws in the page, or returns anything other
        than boolean ``true`` (including a result with no value at all) is
        treated as "not finished yet".

        Args:
            behavior_script: javascript source of the behavior
            timeout: hard limit in seconds (default 900)
        """
        self.send_to_chrome(method='Runtime.evaluate',
                            suppress_logging=True,
                            params={'expression': behavior_script})

        check_interval = min(timeout, 7)
        start = time.time()
        while True:
            elapsed = time.time() - start
            if elapsed > timeout:
                self.logger.info(
                    'behavior reached hard timeout after %.1fs', elapsed)
                return

            brozzler.sleep(check_interval)

            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                method='Runtime.evaluate',
                suppress_logging=True,
                params={'expression': 'umbraBehaviorFinished()'})
            try:
                self._wait_for(
                    lambda: self.websock_thread.received_result(msg_id),
                    timeout=5)
                msg = self.websock_thread.pop_result(msg_id)
            except BrowsingTimeout:
                # no answer to this poll; try again on the next round
                continue

            if not msg or 'result' not in msg:
                continue
            outcome = msg['result']
            if 'exceptionDetails' in outcome or outcome.get('wasThrown'):
                # the poll expression threw inside the page
                continue
            # .get() because a result such as {'type': 'undefined'} carries
            # no 'value' key; only the boolean True counts as finished
            if (outcome.get('result') or {}).get('value') is True:
                self.logger.info('behavior decided it has finished')
                return

    def try_login(self, username, password, timeout=300):
        """Run the ``try-login.js.j2`` script and wait for its outcome.

        After starting the script, ``__brzl_tryLoginState`` is polled until
        it reports either that no login form was found (return immediately)
        or that a form was, or may have been, submitted (then wait for the
        next page load event).

        Args:
            username: rendered into the login script
            password: rendered into the login script
            timeout: passed to self._wait_for() while waiting for the page
                load event after a form submission (default 300)

        Raises:
            BrowsingException: if no conclusive state has been read and more
                than 30 seconds have passed since polling started
        """
        template = brozzler.jinja2_environment().get_template(
            'try-login.js.j2')
        try_login_js = template.render(username=username, password=password)

        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(method='Runtime.evaluate',
                            suppress_logging=True,
                            params=dict(expression=try_login_js))

        # poll until tryLogin is done trying (should be very very quick)
        started = time.time()
        while True:
            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                method='Runtime.evaluate',
                params={
                    'expression':
                    'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'
                })

            state = None
            try:
                self._wait_for(
                    lambda: self.websock_thread.received_result(msg_id),
                    timeout=5)
                reply = self.websock_thread.pop_result(msg_id)
                if reply and 'result' in reply and 'result' in reply['result']:
                    state = reply['result']['result']['value']
            except BrowsingTimeout:
                pass

            if state == 'login-form-not-found':
                # nothing to log in to, we're done
                return
            if state in ('submitted-form', 'maybe-submitted-form'):
                self.logger.info(
                    'submitted a login form, waiting for another '
                    'page load event')
                break
            # any other state: ask again for __brzl_tryLoginState

            if time.time() - started > 30:
                raise BrowsingException(
                    'timed out trying to check if tryLogin finished')

        # only reached after a form submission: wait for the page load event
        self._wait_for(lambda: self.websock_thread.got_page_load_event,
                       timeout=timeout)
Example #9
0
class Browser:
    '''
    Manages an instance of Chrome for browsing pages.
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    def __init__(self, **kwargs):
        '''
        Sets up a Browser that is not running yet.

        Builds the Chrome object and clears the websocket url, the browsing
        flag and the browser controller.

        Args:
            **kwargs: arguments for Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        self._browser_controller = None
        self.is_browsing = False
        self.websocket_url = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self, **kwargs):
        '''
        Starts chrome if it's not running.

        Args:
            **kwargs: arguments for self.chrome.start(...)
        '''
        if not self.is_running():
            self.websocket_url = self.chrome.start(**kwargs)
            self._browser_controller = BrowserController(self.websocket_url)
            self._browser_controller.start()

    def stop(self):
        '''
        Stops chrome if it's running.
        '''
        try:
            if self._browser_controller:
                self._browser_controller.stop()
            self.websocket_url = None
            self.chrome.stop()
        except:
            self.logger.error('problem stopping', exc_info=True)

    def is_running(self):
        return self.websocket_url is not None

    def browse_page(
            self, page_url, ignore_cert_errors=False, extra_headers=None,
            user_agent=None, behavior_parameters=None,
            on_request=None, on_response=None, on_screenshot=None):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, runs behaviors, takes a
        screenshot, extracts outlinks.

        NOTE(review): ignore_cert_errors, extra_headers, user_agent,
        on_request and on_response are not referenced in the visible body of
        this method; confirm where they are meant to be applied.

        Args:
            page_url: url of the page to browse
            ignore_cert_errors: (default False)
            extra_headers: dict of extra http headers to configure the browser
                to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the raw jpeg bytes (default None)
                # XXX takes two arguments, the url of the page at the time the
                # screenshot was taken, and the raw jpeg bytes (default None)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the original
                page url if the page redirects, javascript has changed the url
                in the location bar, etc
            outlinks: the navigational links extracted from the page, as
                returned by the browser controller's extract_outlinks()

        Raises:
            BrowsingException: if the browser has not been started, if it is
                already busy browsing a page, or if the websocket connection
                is closed while browsing
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        # busy flag; cleared again in the finally clause below
        self.is_browsing = True
        try:
            self._browser_controller.navigate_to_page(page_url, timeout=300)
            ## if login_credentials:
            ##     self._browser_controller.try_login(login_credentials) (5 min?)
            behavior_script = brozzler.behavior_script(
                    page_url, behavior_parameters)
            self._browser_controller.run_behavior(behavior_script, timeout=900)
            if on_screenshot:
                # the screenshot is taken from the top of the page
                self._browser_controller.scroll_to_top()
                jpeg_bytes = self._browser_controller.screenshot()
                on_screenshot(jpeg_bytes)
            outlinks = self._browser_controller.extract_outlinks()
            ## for each hashtag not already visited:
            ##     navigate_to_hashtag (nothing to wait for so no timeout?)
            ##     if on_screenshot;
            ##         take screenshot (30 sec)
            ##     run behavior (3 min)
            ##     outlinks += retrieve_outlinks (60 sec)
            final_page_url = self._browser_controller.url()
            return final_page_url, outlinks
        except websocket.WebSocketConnectionClosedException as e:
            # surfaced to callers as the documented BrowsingException
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(e)
        finally:
            self.is_browsing = False