def __init__(self, **kwargs):
    '''
    Builds the Browser around a Chrome(...) object.

    Args:
        **kwargs: forwarded unchanged to Chrome(...)
    '''
    self.chrome = Chrome(**kwargs)
    # websocket-related attributes all start out unset
    self.websock_url = self.websock = self.websock_thread = None
    self.is_browsing = False
    # source of the ids attached to messages
    self._command_id = Counter()
def start(self, proxy=None, cookie_db=None):
    """
    Starts chrome unless a chrome instance already exists.

    Args:
        proxy: proxy to hand to Chrome(...); falls back to self.proxy when
            falsy (default None)
        cookie_db: cookie db to hand to Chrome(...) (default None)

    Raises:
        Whatever Chrome.start() raises. In that case self._chrome_instance
        is reset to None first, so a later start() tries again from scratch.
    """
    if self._chrome_instance:
        return

    # The original always passed cookie_db=None here, silently discarding
    # the caller's argument.
    self._chrome_instance = Chrome(
            port=self.chrome_port, executable=self.chrome_exe,
            ignore_cert_errors=self.ignore_cert_errors,
            proxy=proxy or self.proxy, cookie_db=cookie_db)
    try:
        self._websocket_url = self._chrome_instance.start()
    except BaseException:
        # don't keep a reference to an instance that failed to start
        self._chrome_instance = None
        raise
def __init__(self, **kwargs):
    '''
    Builds the Browser around a Chrome(...) object.

    Args:
        **kwargs: forwarded unchanged to Chrome(...)
    '''
    self.chrome = Chrome(**kwargs)
    # these two start out unset
    self.websocket_url = self._browser_controller = None
    self.is_browsing = False
def __init__(self, **kwargs):
    '''
    Builds the Browser around a Chrome(...) object.

    Args:
        **kwargs: forwarded unchanged to Chrome(...)
    '''
    self.chrome = Chrome(**kwargs)
    # websocket-related attributes all start out unset
    self.websock_url = self.websock = self.websock_thread = None
    self.is_browsing = False
    # source of the ids attached to messages
    self._command_id = Counter()
    self._wait_interval = 0.5
class Browser:
    """
    Runs chrome/chromium to synchronously browse one page at a time using
    worker.browse_page(). Should not be accessed from multiple threads.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    # browse_page() gives up waiting on behaviors after this many seconds and
    # moves on to the post-behavior chain (screenshot, outlinks)
    HARD_TIMEOUT_SECONDS = 20 * 60

    def __init__(
            self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None,
            ignore_cert_errors=False):
        """
        Stores the chrome configuration; chrome itself is launched by start().

        Args:
            chrome_port: passed to Chrome(...) as `port`
            chrome_exe: passed to Chrome(...) as `executable`
            proxy: default proxy, used when start() is not given one
            ignore_cert_errors: passed to Chrome(...) unchanged
        """
        self.command_id = itertools.count(1)
        self.chrome_port = chrome_port
        self.chrome_exe = chrome_exe
        self.proxy = proxy
        self.ignore_cert_errors = ignore_cert_errors
        self._behavior = None
        self._websock = None
        self._abort_browse_page = False
        self._chrome_instance = None
        self._aw_snap_hes_dead_jim = None
        self._work_dir = None
        self._websocket_url = None
        # set once the post-behavior chain has been kicked off for the
        # current page, so that it is started at most once per browse_page()
        self._postbehavior_chain_started = False

    def __repr__(self):
        return "{}.{}:{}".format(
                Browser.__module__, Browser.__qualname__, self.chrome_port)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self, proxy=None, cookie_db=None):
        """
        Starts chrome unless a chrome instance already exists.

        Args:
            proxy: proxy to hand to Chrome(...); falls back to self.proxy
                when falsy (default None)
            cookie_db: cookie db to hand to Chrome(...) (default None)

        Raises:
            Whatever Chrome.start() raises; self._chrome_instance is reset
            to None first so that a later start() tries again.
        """
        if self._chrome_instance:
            return

        # The original always passed cookie_db=None here, silently
        # discarding the caller's argument.
        self._chrome_instance = Chrome(
                port=self.chrome_port, executable=self.chrome_exe,
                ignore_cert_errors=self.ignore_cert_errors,
                proxy=proxy or self.proxy, cookie_db=cookie_db)
        try:
            self._websocket_url = self._chrome_instance.start()
        except BaseException:
            self._chrome_instance = None
            raise

    def stop(self):
        """
        Stops chrome if it is running. Never raises; problems are logged.
        """
        try:
            if self.is_running():
                self._chrome_instance.stop()
                self._chrome_instance = None
                self._websocket_url = None
        except:
            # deliberate best effort
            self.logger.error("problem stopping", exc_info=True)

    def is_running(self):
        """Returns True if start() has produced a websocket url."""
        return bool(self._websocket_url)

    def abort_browse_page(self):
        """Asks the browse_page() loop to raise BrowsingAborted."""
        self._abort_browse_page = True

    def persist_and_read_cookie_db(self):
        """
        Delegates to the chrome instance's persist_and_read_cookie_db();
        returns None if there is no chrome instance.
        """
        if self._chrome_instance:
            return self._chrome_instance.persist_and_read_cookie_db()
        else:
            return None

    def browse_page(
            self, url, extra_headers=None, behavior_parameters=None,
            user_agent=None,
            on_request=None, on_response=None, on_screenshot=None,
            on_url_change=None):
        """
        Synchronously loads a page, runs behaviors, and takes a screenshot.

        Args:
            url: url of the page to browse
            extra_headers: dict of extra http headers sent with
                Network.setExtraHTTPHeaders (default None)
            behavior_parameters: passed to Behavior.start() (default None)
            user_agent: if supplied, sent with Network.setUserAgentOverride
                (default None)
            on_request: called with the decoded message of every
                Network.requestWillBeSent event, except for data: urls
                (default None)
            on_response: called with the decoded message of every
                Network.responseReceived event (default None)
            on_screenshot: called with the base64-decoded screenshot bytes
                (default None)
            on_url_change: called with the value of document.URL after the
                load event if it differs from `url` (default None)

        Raises BrowsingException if browsing the page fails in a non-critical
        way.

        Returns extracted outlinks.
        """
        if not self.is_running():
            raise BrowsingException("browser has not been started")
        self.url = url
        self.extra_headers = extra_headers
        self.user_agent = user_agent
        self.on_request = on_request
        self.on_screenshot = on_screenshot
        self.on_url_change = on_url_change
        self.on_response = on_response
        self.behavior_parameters = behavior_parameters

        # per-page state
        self._outlinks = None
        self._reached_limit = None
        self._aw_snap_hes_dead_jim = None
        self._abort_browse_page = False
        self._has_screenshot = False
        self._waiting_on_result_messages = {}
        self._result_message_timeout = None
        self._postbehavior_chain_started = False

        self._websock = websocket.WebSocketApp(
                self._websocket_url, on_open=self._visit_page,
                on_message=self._wrap_handle_message)

        thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
                self.chrome_port, datetime.datetime.utcnow())
        websock_thread = threading.Thread(
                target=self._websock.run_forever, name=thread_name,
                kwargs={'ping_timeout': 0.5})
        websock_thread.start()
        self._start = time.time()

        try:
            while True:
                time.sleep(0.5)
                if self._browse_interval_func():
                    return self._outlinks
        finally:
            if (self._websock and self._websock.sock
                    and self._websock.sock.connected):
                try:
                    self._websock.close()
                except BaseException as e:
                    self.logger.error(
                            "exception closing websocket %s - %s",
                            self._websock, e)

            websock_thread.join(timeout=30)
            if websock_thread.is_alive():
                self.logger.error(
                        "%s still alive 30 seconds after closing %s, will "
                        "forcefully nudge it again",
                        websock_thread, self._websock)
                self._websock.keep_running = False
                websock_thread.join(timeout=30)
                if websock_thread.is_alive():
                    self.logger.critical(
                            "%s still alive 60 seconds after closing %s",
                            websock_thread, self._websock)

            self._behavior = None

    # Evaluated in the page. `outlinks` starts out as an empty array so that a
    # frame without a document contributes nothing, rather than `undefined`
    # (which would end up as an empty entry in the joined result).
    OUTLINKS_JS = r"""
var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) {
    __brzl_framesDone.add(frame);
    var outlinks = [];
    if (frame && frame.document) {
        outlinks = Array.prototype.slice.call(
                frame.document.querySelectorAll('a[href]'));
        for (var i = 0; i < frame.frames.length; i++) {
            if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
                outlinks = outlinks.concat(__brzl_compileOutlinks(frame.frames[i]));
            }
        }
    }
    return outlinks;
};
__brzl_compileOutlinks(window).join('\n');
"""

    def _chain_chrome_messages(self, chain):
        """
        Sends a series of messages to chrome/chromium on the debugging protocol
        websocket. Waits for a reply from each one before sending the next.
        Enforces a timeout waiting for each reply. If the timeout is hit, sets
        self._result_message_timeout with a ResultMessageTimeout (an exception
        class). Takes an array of dicts, each of which should look like this:

            {
                "info": "human readable description",
                "chrome_msg": { ... },   # message to send to chrome, as a dict
                "timeout": 30,           # timeout in seconds
                "callback": my_callback, # takes one arg, the result message
            }

        The code is rather convoluted because of the asynchronous nature of the
        whole thing. See how it's used in _start_postbehavior_chain.
        """
        timer = None

        def callback(message):
            # reply arrived: disarm the timeout, run the link's callback,
            # then send the rest of the chain
            if timer:
                timer.cancel()
            if "callback" in chain[0]:
                chain[0]["callback"](message)
            self._chain_chrome_messages(chain[1:])

        def timeout():
            # The original passed the format args to the exception
            # constructor, so the message was never actually formatted.
            self._result_message_timeout = ResultMessageTimeout(
                    "timed out after %.1fs waiting for result message "
                    "for %s" % (chain[0]["timeout"], chain[0]["chrome_msg"]))

        if chain:
            msg_id = self.send_to_chrome(**chain[0]["chrome_msg"])
            self._waiting_on_result_messages[msg_id] = callback
            self.logger.info(
                    "msg_id=%s for message %s", msg_id, chain[0]["chrome_msg"])
            timer = threading.Timer(chain[0]["timeout"], timeout)
            timer.daemon = True
            timer.start()
        else:
            self.logger.info("finished chrome message chain")

    def _start_postbehavior_chain(self):
        """
        Kicks off the messages sent once behaviors are done: scroll to top
        and screenshot (only if on_screenshot is set), then outlinks.
        """
        if self.on_screenshot:
            chain = [{
                "info": "scrolling to top",
                "chrome_msg": {
                    "method": "Runtime.evaluate",
                    "params": {"expression": "window.scrollTo(0, 0);"},
                },
                "timeout": 30,
                "callback": lambda message: None,
            }, {
                "info": "requesting screenshot",
                "chrome_msg": {"method": "Page.captureScreenshot"},
                "timeout": 30,
                "callback": lambda message: (
                    self.on_screenshot and self.on_screenshot(
                        base64.b64decode(message["result"]["data"]))),
            }]
        else:
            chain = []

        def set_outlinks(message):
            value = message["result"]["result"]["value"]
            if value:
                self._outlinks = frozenset(value.split("\n"))
            else:
                self._outlinks = frozenset()

        chain.append({
            "info": "retrieving outlinks",
            "chrome_msg": {
                "method": "Runtime.evaluate",
                "params": {"expression": self.OUTLINKS_JS},
            },
            "timeout": 60,
            "callback": set_outlinks,
        })

        self._chain_chrome_messages(chain)

    def _browse_interval_func(self):
        """Called periodically while page is being browsed. Returns True when
        finished browsing."""
        if (not self._websock or not self._websock.sock
                or not self._websock.sock.connected):
            raise BrowsingException(
                    "websocket closed, did chrome die? {}".format(
                        self._websocket_url))
        elif self._result_message_timeout:
            raise self._result_message_timeout
        elif self._aw_snap_hes_dead_jim:
            raise BrowsingException(
                    """chrome tab went "aw snap" or "he's dead jim"!""")
        elif self._outlinks is not None:
            # setting self._outlinks is the last thing that happens in the
            # post-behavior chain
            return True
        elif (not self._postbehavior_chain_started
                and (self._behavior is not None
                     and self._behavior.is_finished()
                     or time.time() - self._start
                     > Browser.HARD_TIMEOUT_SECONDS)):
            if self._behavior and self._behavior.is_finished():
                self.logger.info(
                        "behavior decided it's finished with %s", self.url)
            else:
                self.logger.info(
                        "reached hard timeout of %s seconds url=%s",
                        Browser.HARD_TIMEOUT_SECONDS, self.url)
            self._behavior = None
            # Once the hard timeout has passed the elapsed-time condition
            # stays true on every tick; without this flag the chain would be
            # restarted every interval until the outlinks arrive.
            self._postbehavior_chain_started = True
            self._start_postbehavior_chain()
            return False
        elif self._reached_limit:
            raise self._reached_limit
        elif self._abort_browse_page:
            raise BrowsingAborted("browsing page aborted")
        else:
            return False

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        """
        Sends kwargs, plus a freshly allocated "id", to chrome as json over
        the websocket.

        Args:
            suppress_logging: if truthy, the message is not logged
            **kwargs: the message, e.g. method=..., params=...

        Returns:
            the id assigned to the message
        """
        msg_id = next(self.command_id)
        kwargs["id"] = msg_id
        msg = json.dumps(kwargs)
        if not suppress_logging:
            self.logger.debug("sending message to %s: %s", self._websock, msg)
        self._websock.send(msg)
        return msg_id

    def _visit_page(self, websock):
        """
        Websocket on_open handler: enables the protocol domains, configures
        headers / user agent / the google analytics breakpoint, then
        navigates to self.url.
        """
        # navigate to about:blank here to avoid situation where we navigate to
        # the same page that we're currently on, perhaps with a different
        # #fragment, which prevents Page.loadEventFired from happening
        self.send_to_chrome(
                method="Page.navigate", params={"url": "about:blank"})

        self.send_to_chrome(method="Network.enable")
        self.send_to_chrome(method="Page.enable")
        self.send_to_chrome(method="Console.enable")
        self.send_to_chrome(method="Debugger.enable")
        self.send_to_chrome(method="Runtime.enable")

        # copy, so the dict the caller handed to browse_page() is not mutated
        headers = dict(self.extra_headers or {})
        headers['Accept-Encoding'] = 'identity'
        self.send_to_chrome(
                method="Network.setExtraHTTPHeaders",
                params={"headers": headers})

        if self.user_agent:
            self.send_to_chrome(
                    method="Network.setUserAgentOverride",
                    params={"userAgent": self.user_agent})

        # disable google analytics, see _handle_message() where breakpoint is
        # caught "Debugger.paused"
        self.send_to_chrome(
                method="Debugger.setBreakpointByUrl",
                params={
                    "lineNumber": 1,
                    "urlRegex": "https?://www.google-analytics.com/analytics.js"})

        # navigate to the page!
        self.send_to_chrome(method="Page.navigate", params={"url": self.url})

    def _wrap_handle_message(self, websock, message):
        """
        Websocket on_message handler. Any exception from _handle_message()
        is logged and turned into an abort of the current browse_page().
        """
        try:
            self._handle_message(websock, message)
        except:
            self.logger.error(
                    "uncaught exception in _handle_message message=%s",
                    message, exc_info=True)
            self.abort_browse_page()

    def _network_request_will_be_sent(self, message):
        if self._behavior:
            self._behavior.notify_of_activity()
        request_url = message["params"]["request"]["url"]
        if request_url.lower().startswith("data:"):
            self.logger.debug(
                    "ignoring data url {}".format(request_url[:80]))
        elif self.on_request:
            self.on_request(message)

    def _network_response_received(self, message):
        response = message["params"]["response"]
        # only the first 420 response carrying Warcprox-Meta is recorded
        if not self._reached_limit and response["status"] == 420:
            headers = CaseInsensitiveDict(response["headers"])
            if "Warcprox-Meta" in headers:
                warcprox_meta = json.loads(headers["Warcprox-Meta"])
                self._reached_limit = brozzler.ReachedLimit(
                        warcprox_meta=warcprox_meta)
                self.logger.info("reached limit %s", self._reached_limit)
        if self.on_response:
            self.on_response(message)

    def _page_load_event_fired(self, message):
        def page_url_after_load_event(message):
            url_now = message["result"]["result"]["value"]
            if url_now != self.url and self.on_url_change:
                self.on_url_change(url_now)

        msg_id = self.send_to_chrome(
                method="Runtime.evaluate",
                params={"expression": "document.URL"})
        self._waiting_on_result_messages[msg_id] = page_url_after_load_event

        self.logger.info(
                "Page.loadEventFired, moving on to starting behaviors "
                "url={}".format(self.url))
        self._behavior = Behavior(self.url, self)
        self._behavior.start(self.behavior_parameters)

    def _console_message_added(self, message):
        self.logger.debug(
                "%s console.%s %s", self._websock.url,
                message["params"]["message"]["level"],
                message["params"]["message"]["text"])

    def _debugger_paused(self, message):
        # We hit the breakpoint set in visit_page. Get rid of google
        # analytics script!
        self.logger.debug("debugger paused! message={}".format(message))
        script_id = message['params']['callFrames'][0]['location']['scriptId']

        # replace script
        self.send_to_chrome(
                method="Debugger.setScriptSource",
                params={
                    "scriptId": script_id,
                    "scriptSource":
                        "console.log('google analytics is no more!');"})

        # resume execution
        self.send_to_chrome(method="Debugger.resume")

    def _handle_message(self, websock, json_message):
        """
        Decodes one websocket message and dispatches it: events ("method")
        go to their handler, results ("result") go to whichever callback or
        behavior is waiting on that id. Anything else is ignored.
        """
        message = json.loads(json_message)
        if "method" in message:
            method = message["method"]
            if method == "Network.requestWillBeSent":
                self._network_request_will_be_sent(message)
            elif method == "Network.responseReceived":
                self._network_response_received(message)
            elif method == "Page.loadEventFired":
                self._page_load_event_fired(message)
            elif method == "Console.messageAdded":
                self._console_message_added(message)
            elif method == "Debugger.paused":
                self._debugger_paused(message)
            elif method == "Inspector.targetCrashed":
                self._aw_snap_hes_dead_jim = message
        elif "result" in message:
            msg_id = message["id"]
            if msg_id in self._waiting_on_result_messages:
                callback = self._waiting_on_result_messages.pop(msg_id)
                self.logger.debug(
                        "received result for message id=%s, calling %s",
                        msg_id, callback)
                callback(message)
            elif self._behavior and self._behavior.is_waiting_on_result(
                    msg_id):
                self._behavior.notify_of_result(message)
class Browser:
    '''
    Manages an instance of Chrome for browsing pages.
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    def __init__(self, **kwargs):
        '''
        Initializes the Browser.

        Args:
            **kwargs: arguments for Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        self.websock_url = None
        self.websock = None
        self.websock_thread = None
        self.is_browsing = False
        self._command_id = Counter()
        # seconds slept between polls in _wait_for()
        self._wait_interval = 0.5

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def _wait_for(self, callback, timeout=None):
        '''
        Spins until callback() returns truthy.

        Args:
            callback: zero-argument callable to poll
            timeout: seconds to wait; None or 0 means wait forever

        Raises:
            BrowsingTimeout: if `timeout` elapses first
        '''
        start = time.time()
        while True:
            if callback():
                return
            elapsed = time.time() - start
            if timeout and elapsed > timeout:
                raise BrowsingTimeout(
                        'timed out after %.1fs waiting for: %s' % (
                            elapsed, callback))
            brozzler.sleep(self._wait_interval)

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        '''
        Sends kwargs, plus a freshly allocated 'id', to chrome as compact
        json over the websocket.

        Args:
            suppress_logging: if truthy, log the message at logging.TRACE
                instead of logging.DEBUG
            **kwargs: the message, e.g. method=..., params=...

        Returns:
            the id assigned to the message
        '''
        msg_id = next(self._command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs, separators=(',', ':'))
        # log through the class logger, like the rest of this class
        self.logger.log(
                logging.TRACE if suppress_logging else logging.DEBUG,
                'sending message to %s: %s', self.websock, msg)
        self.websock.send(msg)
        return msg_id

    def start(self, **kwargs):
        '''
        Starts chrome if it's not running.

        Args:
            **kwargs: arguments for self.chrome.start(...)
        '''
        if not self.is_running():
            self.websock_url = self.chrome.start(**kwargs)
            self.websock = websocket.WebSocketApp(self.websock_url)
            self.websock_thread = WebsockReceiverThread(
                    self.websock, name='WebsockThread:%s' % self.chrome.port)
            self.websock_thread.start()

            self._wait_for(lambda: self.websock_thread.is_open, timeout=30)

            # tell browser to send us messages we're interested in
            self.send_to_chrome(method='Network.enable')
            self.send_to_chrome(method='Page.enable')
            self.send_to_chrome(method='Console.enable')
            self.send_to_chrome(method='Runtime.enable')

            # disable google analytics
            self.send_to_chrome(
                method='Network.setBlockedURLs',
                params={'urls': ['*google-analytics.com/analytics.js',
                                 '*google-analytics.com/ga.js']}
                )

    def stop(self):
        '''
        Stops chrome if it's running. Never raises; problems are logged.
        '''
        try:
            if (self.websock and self.websock.sock
                    and self.websock.sock.connected):
                self.logger.info('shutting down websocket connection')
                try:
                    self.websock.close()
                except BaseException as e:
                    self.logger.error(
                            'exception closing websocket %s - %s',
                            self.websock, e)

            self.chrome.stop()

            # a thread cannot join itself
            if self.websock_thread and (
                    self.websock_thread != threading.current_thread()):
                self.websock_thread.join(timeout=30)
                if self.websock_thread.is_alive():
                    self.logger.error(
                            '%s still alive 30 seconds after closing %s, will '
                            'forcefully nudge it again', self.websock_thread,
                            self.websock)
                    self.websock.keep_running = False
                    self.websock_thread.join(timeout=30)
                    if self.websock_thread.is_alive():
                        self.logger.critical(
                                '%s still alive 60 seconds after closing %s',
                                self.websock_thread, self.websock)

            self.websock_url = None
        except:
            # deliberate best effort
            self.logger.error('problem stopping', exc_info=True)

    def is_running(self):
        '''Returns True if start() has produced a websocket url.'''
        return self.websock_url is not None

    def browse_page(
            self, page_url, extra_headers=None,
            user_agent=None, behavior_parameters=None, behaviors_dir=None,
            on_request=None, on_response=None,
            on_screenshot=None, username=None, password=None, hashtags=None,
            skip_extract_outlinks=False, skip_visit_hashtags=False,
            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, runs behaviors, takes a
        screenshot, extracts outlinks.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the browser
                to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: Directory containing behaviors.yaml and JS templates
                (default None loads Brozzler default JS behaviors)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the raw jpeg bytes (default None)
                # XXX takes two arguments, the url of the page at the time the
                # screenshot was taken, and the raw jpeg bytes (default None)
            username: username passed to try_login() (default None)
            password: password passed to try_login(); login is only attempted
                when this is truthy (default None)
            hashtags: extra hashtags for visit_hashtags() (default None)
            skip_extract_outlinks: if truthy, outlinks are not extracted and
                an empty list is returned for them (default False)
            skip_visit_hashtags: if truthy, visit_hashtags() is not called
                (default False)
            skip_youtube_dl: accepted but not used by this method
            page_timeout: seconds to wait for each page load event
                (default 300)
            behavior_timeout: seconds to let the behavior run (default 900)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the original
                page url if the page redirects, javascript has changed the url
                in the location bar, etc
            outlinks: a frozenset of navigational links extracted from the
                page, or an empty list if skip_extract_outlinks

        Raises:
            brozzler.ProxyError: in case of proxy connection error
            BrowsingException: if browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response
        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(
                        extra_headers=extra_headers,
                        user_agent=user_agent)
                self.navigate_to_page(page_url, timeout=page_timeout)
                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    if page_url != self.url().split('#')[0]:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)
                if on_screenshot:
                    self._try_screenshot(on_screenshot)
                behavior_script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                self.run_behavior(behavior_script, timeout=behavior_timeout)
                if skip_extract_outlinks:
                    outlinks = []
                else:
                    outlinks = self.extract_outlinks()
                if not skip_visit_hashtags:
                    self.visit_hashtags(self.url(), hashtags, outlinks)
                final_page_url = self.url()
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(e) from e
        finally:
            self.is_browsing = False
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None

    def _try_screenshot(self, on_screenshot):
        '''
        Takes a screenshot and hands it to on_screenshot, retrying up to 3
        times on BrowsingTimeout. Gives up silently (after logging) if all
        attempts time out.
        '''
        for i in range(3):
            try:
                jpeg_bytes = self.screenshot()
                on_screenshot(jpeg_bytes)
                return
            except BrowsingTimeout as e:
                self.logger.error('attempt %s/3: %s', i+1, e)

    def visit_hashtags(self, page_url, hashtags, outlinks):
        '''
        Navigates to page_url with each hashtag in turn. The hashtags visited
        are those in `hashtags` plus the fragment of every outlink that,
        minus its fragment, equals page_url.
        '''
        _hashtags = set(hashtags or [])
        for outlink in outlinks:
            url = urlcanon.whatwg(outlink)
            hashtag = (url.hash_sign + url.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url)
            if hashtag and str(url) == page_url:
                _hashtags.add(hashtag)
        # could inject a script that listens for HashChangeEvent to figure
        # out which hashtags were visited already and skip those
        for hashtag in _hashtags:
            # navigate_to_hashtag (nothing to wait for so no timeout?)
            self.logger.debug('navigating to hashtag %s', hashtag)
            url = urlcanon.whatwg(page_url)
            url.hash_sign = b'#'
            url.fragment = hashtag[1:].encode('utf-8')
            self.send_to_chrome(
                    method='Page.navigate', params={'url': str(url)})
            time.sleep(5) # um.. wait for idleness or something?
            # take another screenshot?
            # run behavior again with short timeout?
            # retrieve outlinks again and append to list?

    def configure_browser(self, extra_headers=None, user_agent=None):
        '''
        Sends the extra http headers (always including Accept-Encoding: gzip)
        and waits for chrome to acknowledge them, then sends the user agent
        override if one is supplied.
        '''
        # copy, so the caller's dict is not mutated
        headers = dict(extra_headers or {})
        headers['Accept-Encoding'] = 'gzip'  # avoid encodings br, sdch
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
                method='Network.setExtraHTTPHeaders',
                params={'headers': headers})
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=10)
        if user_agent:
            self.send_to_chrome(
                    method='Network.setUserAgentOverride',
                    params={'userAgent': user_agent})

    def navigate_to_page(self, page_url, timeout=300):
        '''
        Navigates to page_url and waits up to `timeout` seconds for
        websock_thread to report a page load event.
        '''
        self.logger.info('navigating to page %s', page_url)
        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(method='Page.navigate', params={'url': page_url})
        self._wait_for(
                lambda: self.websock_thread.got_page_load_event,
                timeout=timeout)

    def extract_outlinks(self, timeout=60):
        '''
        Evaluates the extract-outlinks.js template in the page.

        Returns:
            frozenset of the newline-separated values returned by the
            script; empty if there are none or the result is malformed
        '''
        self.logger.info('extracting outlinks')
        self.websock_thread.expect_result(self._command_id.peek())
        js = brozzler.jinja2_environment().get_template(
                'extract-outlinks.js').render()
        msg_id = self.send_to_chrome(
                method='Runtime.evaluate', params={'expression': js})
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        if ('result' in message and 'result' in message['result']
                and 'value' in message['result']['result']):
            if message['result']['result']['value']:
                return frozenset(
                        message['result']['result']['value'].split('\n'))
            else:
                # no links found
                return frozenset()
        else:
            self.logger.error(
                    'problem extracting outlinks, result message: %s', message)
            return frozenset()

    def screenshot(self, timeout=45):
        '''
        Sends Page.captureScreenshot and returns the base64-decoded data.

        Raises:
            BrowsingTimeout: if no result arrives within `timeout` seconds
        '''
        self.logger.info('taking screenshot')
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(method='Page.captureScreenshot')
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        jpeg_bytes = base64.b64decode(message['result']['data'])
        return jpeg_bytes

    def url(self, timeout=30):
        '''
        Returns value of document.URL from the browser.
        '''
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
                method='Runtime.evaluate',
                params={'expression': 'document.URL'})
        self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        return message['result']['result']['value']

    def run_behavior(self, behavior_script, timeout=900):
        '''
        Evaluates behavior_script in the page, then polls
        umbraBehaviorFinished() (after a 7 second sleep each round) until it
        returns true or `timeout` seconds have elapsed.
        '''
        self.send_to_chrome(
                method='Runtime.evaluate', suppress_logging=True,
                params={'expression': behavior_script})

        start = time.time()
        while True:
            elapsed = time.time() - start
            if elapsed > timeout:
                self.logger.info(
                        'behavior reached hard timeout after %.1fs', elapsed)
                return

            brozzler.sleep(7)

            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                     method='Runtime.evaluate', suppress_logging=True,
                     params={'expression': 'umbraBehaviorFinished()'})
            try:
                self._wait_for(
                        lambda: self.websock_thread.received_result(msg_id),
                        timeout=5)
                msg = self.websock_thread.pop_result(msg_id)
                # .get(): the evaluate result may have no 'value' at all
                # (e.g. when the expression yields undefined); only a
                # genuine boolean true counts as finished
                if (msg and 'result' in msg
                        and not ('exceptionDetails' in msg['result'])
                        and not ('wasThrown' in msg['result']
                            and msg['result']['wasThrown'])
                        and 'result' in msg['result']
                        and msg['result']['result'].get('value') is True):
                    self.logger.info('behavior decided it has finished')
                    return
            except BrowsingTimeout:
                pass

    def try_login(self, username, password, timeout=300):
        '''
        Evaluates the try-login.js.j2 template in the page, then polls
        __brzl_tryLoginState. Returns right away if no login form was found;
        if a form was (maybe) submitted, waits up to `timeout` seconds for
        another page load event.

        Raises:
            BrowsingException: if the login state can't be determined
                within 30 seconds
        '''
        try_login_js = brozzler.jinja2_environment().get_template(
                'try-login.js.j2').render(username=username, password=password)

        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(
                method='Runtime.evaluate', suppress_logging=True,
                params={'expression': try_login_js})

        # wait for tryLogin to finish trying (should be very very quick)
        start = time.time()
        while True:
            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                method='Runtime.evaluate',
                params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'})
            try:
                self._wait_for(
                        lambda: self.websock_thread.received_result(msg_id),
                        timeout=5)
                msg = self.websock_thread.pop_result(msg_id)
                if (msg and 'result' in msg
                        and 'result' in msg['result']):
                    # .get(): no 'value' key when the state is undefined
                    result = msg['result']['result'].get('value')
                    if result == 'login-form-not-found':
                        # we're done
                        return
                    elif result in ('submitted-form', 'maybe-submitted-form'):
                        # wait for page load event below
                        self.logger.info(
                                'submitted a login form, waiting for another '
                                'page load event')
                        break
                    # else try again to get __brzl_tryLoginState

            except BrowsingTimeout:
                pass

            if time.time() - start > 30:
                raise BrowsingException(
                        'timed out trying to check if tryLogin finished')

        # if we get here, we submitted a form, now we wait for another page
        # load event
        self._wait_for(
                lambda: self.websock_thread.got_page_load_event,
                timeout=timeout)
class Browser: ''' Manages an instance of Chrome for browsing pages. ''' logger = logging.getLogger(__module__ + '.' + __qualname__) def __init__(self, **kwargs): ''' Initializes the Browser. Args: **kwargs: arguments for Chrome(...) ''' self.chrome = Chrome(**kwargs) self.websock_url = None self.websock = None self.websock_thread = None self.is_browsing = False self._command_id = Counter() self._wait_interval = 0.5 def __enter__(self): self.start() return self def __exit__(self, *args): self.stop() def _wait_for(self, callback, timeout=None): ''' Spins until callback() returns truthy. ''' start = time.time() while True: if callback(): return elapsed = time.time() - start if timeout and elapsed > timeout: raise BrowsingTimeout( 'timed out after %.1fs waiting for: %s' % ( elapsed, callback)) brozzler.sleep(self._wait_interval) def send_to_chrome(self, suppress_logging=False, **kwargs): msg_id = next(self._command_id) kwargs['id'] = msg_id msg = json.dumps(kwargs, separators=',:') logging.log( logging.TRACE if suppress_logging else logging.DEBUG, 'sending message to %s: %s', self.websock, msg) self.websock.send(msg) return msg_id def start(self, **kwargs): ''' Starts chrome if it's not running. Args: **kwargs: arguments for self.chrome.start(...) 
''' if not self.is_running(): self.websock_url = self.chrome.start(**kwargs) self.websock = websocket.WebSocketApp(self.websock_url) self.websock_thread = WebsockReceiverThread( self.websock, name='WebsockThread:%s' % self.chrome.port) self.websock_thread.start() self._wait_for(lambda: self.websock_thread.is_open, timeout=30) # tell browser to send us messages we're interested in self.send_to_chrome(method='Network.enable') self.send_to_chrome(method='Page.enable') self.send_to_chrome(method='Console.enable') self.send_to_chrome(method='Runtime.enable') self.send_to_chrome(method='ServiceWorker.enable') self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad') # disable google analytics self.send_to_chrome( method='Network.setBlockedURLs', params={'urls': ['*google-analytics.com/analytics.js', '*google-analytics.com/ga.js']}) def stop(self): ''' Stops chrome if it's running. ''' try: if (self.websock and self.websock.sock and self.websock.sock.connected): self.logger.info('shutting down websocket connection') try: self.websock.close() except BaseException as e: self.logger.error( 'exception closing websocket %s - %s', self.websock, e) self.chrome.stop() if self.websock_thread and ( self.websock_thread != threading.current_thread()): self.websock_thread.join(timeout=30) if self.websock_thread.is_alive(): self.logger.error( '%s still alive 30 seconds after closing %s, will ' 'forcefully nudge it again', self.websock_thread, self.websock) self.websock.keep_running = False self.websock_thread.join(timeout=30) if self.websock_thread.is_alive(): self.logger.critical( '%s still alive 60 seconds after closing %s', self.websock_thread, self.websock) self.websock_url = None except: self.logger.error('problem stopping', exc_info=True) def is_running(self): return self.websock_url is not None def browse_page( self, page_url, extra_headers=None, user_agent=None, behavior_parameters=None, behaviors_dir=None, on_request=None, on_response=None, 
on_service_worker_version_updated=None, on_screenshot=None, username=None, password=None, hashtags=None, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300, behavior_timeout=900): ''' Browses page in browser. Browser should already be running, i.e. start() should have been called. Opens the page_url in the browser, runs behaviors, takes a screenshot, extracts outlinks. Args: page_url: url of the page to browse extra_headers: dict of extra http headers to configure the browser to send with every request (default None) user_agent: user agent string, replaces browser default if supplied (default None) behavior_parameters: dict of parameters for populating the javascript behavior template (default None) behaviors_dir: Directory containing behaviors.yaml and JS templates (default None loads Brozzler default JS behaviors) on_request: callback to invoke on every Network.requestWillBeSent event, takes one argument, the json-decoded message (default None) on_response: callback to invoke on every Network.responseReceived event, takes one argument, the json-decoded message (default None) on_service_worker_version_updated: callback to invoke on every ServiceWorker.workerVersionUpdated event, takes one argument, the json-decoded message (default None) on_screenshot: callback to invoke when screenshot is obtained, takes one argument, the the raw jpeg bytes (default None) # XXX takes two arguments, the url of the page at the time the # screenshot was taken, and the raw jpeg bytes (default None) username: username string to use to try logging in if a login form is found in the page (default None) password: password string to use to try logging in if a login form is found in the page (default None) ... (there are more) Returns: A tuple (final_page_url, outlinks). 
final_page_url: the url in the location bar at the end of the browse_page cycle, which could be different from the original page url if the page redirects, javascript has changed the url in the location bar, etc outlinks: a list of navigational links extracted from the page Raises: brozzler.ProxyError: in case of proxy connection error BrowsingException: if browsing the page fails in some other way ''' if not self.is_running(): raise BrowsingException('browser has not been started') if self.is_browsing: raise BrowsingException('browser is already busy browsing a page') self.is_browsing = True if on_request: self.websock_thread.on_request = on_request if on_response: self.websock_thread.on_response = on_response if on_service_worker_version_updated: self.websock_thread.on_service_worker_version_updated = \ on_service_worker_version_updated try: with brozzler.thread_accept_exceptions(): self.configure_browser( extra_headers=extra_headers, user_agent=user_agent) self.navigate_to_page(page_url, timeout=page_timeout) if password: self.try_login(username, password, timeout=page_timeout) # if login redirected us, return to page_url if page_url != self.url().split('#')[0]: self.logger.debug( 'login navigated away from %s; returning!', page_url) self.navigate_to_page(page_url, timeout=page_timeout) if on_screenshot: self._try_screenshot(on_screenshot) behavior_script = brozzler.behavior_script( page_url, behavior_parameters, behaviors_dir=behaviors_dir) self.run_behavior(behavior_script, timeout=behavior_timeout) if skip_extract_outlinks: outlinks = [] else: outlinks = self.extract_outlinks() if not skip_visit_hashtags: self.visit_hashtags(self.url(), hashtags, outlinks) final_page_url = self.url() return final_page_url, outlinks except brozzler.ReachedLimit: # websock_thread has stashed the ReachedLimit exception with # more information, raise that one raise self.websock_thread.reached_limit except websocket.WebSocketConnectionClosedException as e: 
self.logger.error('websocket closed, did chrome die?') raise BrowsingException(e) finally: self.is_browsing = False self.websock_thread.on_request = None self.websock_thread.on_response = None def _try_screenshot(self, on_screenshot): for i in range(3): try: jpeg_bytes = self.screenshot() on_screenshot(jpeg_bytes) return except BrowsingTimeout as e: logging.error('attempt %s/3: %s', i+1, e) def visit_hashtags(self, page_url, hashtags, outlinks): _hashtags = set(hashtags or []) for outlink in outlinks: url = urlcanon.whatwg(outlink) hashtag = (url.hash_sign + url.fragment).decode('utf-8') urlcanon.canon.remove_fragment(url) if hashtag and str(url) == page_url: _hashtags.add(hashtag) # could inject a script that listens for HashChangeEvent to figure # out which hashtags were visited already and skip those for hashtag in _hashtags: # navigate_to_hashtag (nothing to wait for so no timeout?) self.logger.debug('navigating to hashtag %s', hashtag) url = urlcanon.whatwg(page_url) url.hash_sign = b'#' url.fragment = hashtag[1:].encode('utf-8') self.send_to_chrome( method='Page.navigate', params={'url': str(url)}) time.sleep(5) # um.. wait for idleness or something? # take another screenshot? # run behavior again with short timeout? # retrieve outlinks again and append to list? 
def configure_browser(self, extra_headers=None, user_agent=None):
    '''
    Sends per-page network settings to chrome.

    Args:
        extra_headers: dict of extra http headers to configure the browser
            to send with every request (default None); the dict is copied,
            the caller's object is never modified
        user_agent: user agent string, replaces browser default if
            supplied (default None)
    '''
    # Copy before adding our own header so the caller's dict is left alone.
    headers = dict(extra_headers or {})
    headers['Accept-Encoding'] = 'gzip'  # avoid encodings br, sdch
    self.websock_thread.expect_result(self._command_id.peek())
    msg_id = self.send_to_chrome(
        method='Network.setExtraHTTPHeaders',
        params={'headers': headers})
    self._wait_for(
        lambda: self.websock_thread.received_result(msg_id),
        timeout=10)
    if user_agent:
        # fire and forget, the result of this command is not awaited
        self.send_to_chrome(
            method='Network.setUserAgentOverride',
            params={'userAgent': user_agent})


def navigate_to_page(self, page_url, timeout=300):
    '''
    Sends Page.navigate for page_url and waits up to `timeout` seconds for
    the websocket thread to flag a page load event.
    '''
    self.logger.info('navigating to page %s', page_url)
    # reset the flag first so that an earlier load event is not mistaken
    # for this navigation's
    self.websock_thread.got_page_load_event = None
    self.send_to_chrome(method='Page.navigate', params={'url': page_url})
    self._wait_for(
        lambda: self.websock_thread.got_page_load_event,
        timeout=timeout)


def extract_outlinks(self, timeout=60):
    '''
    Evaluates the 'extract-outlinks.js' template in the page.

    Returns:
        frozenset of the newline-separated strings returned by the script;
        empty frozenset if no links were found or the result message does
        not have the expected structure (the latter is logged as an error)
    '''
    self.logger.info('extracting outlinks')
    self.websock_thread.expect_result(self._command_id.peek())
    js = brozzler.jinja2_environment().get_template(
        'extract-outlinks.js').render()
    msg_id = self.send_to_chrome(
        method='Runtime.evaluate', params={'expression': js})
    self._wait_for(
        lambda: self.websock_thread.received_result(msg_id),
        timeout=timeout)
    message = self.websock_thread.pop_result(msg_id)
    if ('result' in message and 'result' in message['result']
            and 'value' in message['result']['result']):
        value = message['result']['result']['value']
        if value:
            return frozenset(value.split('\n'))
        # no links found
        return frozenset()
    self.logger.error(
        'problem extracting outlinks, result message: %s', message)
    return frozenset()


def screenshot(self, timeout=45):
    '''
    Sends Page.captureScreenshot and returns the base64-decoded image data
    from chrome's response.
    '''
    self.logger.info('taking screenshot')
    self.websock_thread.expect_result(self._command_id.peek())
    msg_id = self.send_to_chrome(method='Page.captureScreenshot')
    self._wait_for(
        lambda: self.websock_thread.received_result(msg_id),
        timeout=timeout)
    message = self.websock_thread.pop_result(msg_id)
    jpeg_bytes = base64.b64decode(message['result']['data'])
    return jpeg_bytes


def url(self, timeout=30):
    '''
    Returns value of document.URL from the browser.
    '''
    self.websock_thread.expect_result(self._command_id.peek())
    msg_id = self.send_to_chrome(
        method='Runtime.evaluate',
        params={'expression': 'document.URL'})
    self._wait_for(
        lambda: self.websock_thread.received_result(msg_id),
        timeout=timeout)
    message = self.websock_thread.pop_result(msg_id)
    return message['result']['result']['value']


def run_behavior(self, behavior_script, timeout=900):
    '''
    Starts behavior_script in the page, then polls umbraBehaviorFinished()
    until it evaluates to boolean true or `timeout` seconds have elapsed.

    A poll that times out, throws in the page, or returns anything other
    than boolean true is treated as "not finished yet".
    '''
    self.send_to_chrome(
        method='Runtime.evaluate', suppress_logging=True,
        params={'expression': behavior_script})

    start = time.time()
    while True:
        elapsed = time.time() - start
        if elapsed > timeout:
            self.logger.info(
                'behavior reached hard timeout after %.1fs', elapsed)
            return

        brozzler.sleep(7)

        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
            method='Runtime.evaluate', suppress_logging=True,
            params={'expression': 'umbraBehaviorFinished()'})
        try:
            self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=5)
            msg = self.websock_thread.pop_result(msg_id)
            # The evaluation result may lack a 'value' key (e.g. when the
            # expression yields undefined), hence .get(); `is True`
            # accepts only a genuine boolean true.
            if (msg and 'result' in msg
                    and 'exceptionDetails' not in msg['result']
                    and not msg['result'].get('wasThrown')
                    and 'result' in msg['result']
                    and msg['result']['result'].get('value') is True):
                self.logger.info('behavior decided it has finished')
                return
        except BrowsingTimeout:
            pass


def try_login(self, username, password, timeout=300):
    '''
    Runs the 'try-login.js.j2' template in the page and, if it reports that
    a login form was (or may have been) submitted, waits up to `timeout`
    seconds for another page load event.

    Raises:
        BrowsingException: if the login state could not be determined
            within 30 seconds
    '''
    try_login_js = brozzler.jinja2_environment().get_template(
        'try-login.js.j2').render(username=username, password=password)

    self.websock_thread.got_page_load_event = None
    self.send_to_chrome(
        method='Runtime.evaluate', suppress_logging=True,
        params={'expression': try_login_js})

    # wait for tryLogin to finish trying (should be very very quick)
    start = time.time()
    while True:
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
            method='Runtime.evaluate',
            params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'})
        try:
            self._wait_for(
                lambda: self.websock_thread.received_result(msg_id),
                timeout=5)
            msg = self.websock_thread.pop_result(msg_id)
            if (msg and 'result' in msg
                    and 'result' in msg['result']):
                # 'value' is absent while the state variable is still
                # undefined; treat that like any other unknown state
                result = msg['result']['result'].get('value')
                if result == 'login-form-not-found':
                    # we're done
                    return
                elif result in ('submitted-form', 'maybe-submitted-form'):
                    # wait for page load event below
                    self.logger.info(
                        'submitted a login form, waiting for another '
                        'page load event')
                    break
                # else try again to get __brzl_tryLoginState

        except BrowsingTimeout:
            pass

        if time.time() - start > 30:
            raise BrowsingException(
                'timed out trying to check if tryLogin finished')

    # if we get here, we submitted a form, now we wait for another page
    # load event
    self._wait_for(
        lambda: self.websock_thread.got_page_load_event,
        timeout=timeout)
class Browser:
    '''
    Manages an instance of Chrome for browsing pages.

    Commands are sent to chrome over a websocket (send_to_chrome) and
    responses/events are collected by a WebsockReceiverThread, which this
    class polls through _wait_for.
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    def __init__(self, **kwargs):
        '''
        Initializes the Browser.

        Args:
            **kwargs: arguments for Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        self.websock_url = None
        self.websock = None
        self.websock_thread = None
        self.is_browsing = False
        self._command_id = Counter()
        # seconds to sleep between polls in _wait_for
        self._wait_interval = 0.5

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def _wait_for(self, callback, timeout=None):
        '''
        Spins until callback() returns truthy.

        Raises:
            BrowsingTimeout: if `timeout` is truthy and more than `timeout`
                seconds pass without callback() returning truthy
        '''
        start = time.time()
        while True:
            if callback():
                return
            elapsed = time.time() - start
            if timeout and elapsed > timeout:
                raise BrowsingTimeout(
                    'timed out after %.1fs waiting for: %s' % (
                        elapsed, callback))
            brozzler.sleep(self._wait_interval)

    def send_to_chrome(self, suppress_logging=False, **kwargs):
        '''
        Serializes kwargs as a json message, tagged with the next command
        id, and sends it over the websocket.

        Args:
            suppress_logging: log the message at TRACE instead of DEBUG
            **kwargs: the message fields, e.g. method=..., params=...

        Returns:
            the id assigned to the message
        '''
        msg_id = next(self._command_id)
        kwargs['id'] = msg_id
        msg = json.dumps(kwargs, separators=(',', ':'))
        self.logger.log(
            logging.TRACE if suppress_logging else logging.DEBUG,
            'sending message to %s: %s', self.websock, msg)
        self.websock.send(msg)
        return msg_id

    def start(self, **kwargs):
        '''
        Starts chrome if it's not running.

        Args:
            **kwargs: arguments for self.chrome.start(...)
        '''
        if not self.is_running():
            self.websock_url = self.chrome.start(**kwargs)
            self.websock = websocket.WebSocketApp(self.websock_url)
            self.websock_thread = WebsockReceiverThread(
                self.websock, name='WebsockThread:%s' % self.chrome.port)
            self.websock_thread.start()

            self._wait_for(lambda: self.websock_thread.is_open, timeout=30)

            # tell browser to send us messages we're interested in
            self.send_to_chrome(method='Network.enable')
            self.send_to_chrome(method='Page.enable')
            # Enable Console & Runtime output only when debugging.
            # After all, we just print these events with debug(), we don't
            # use them in Brozzler logic.
            if self.logger.isEnabledFor(logging.DEBUG):
                self.send_to_chrome(method='Console.enable')
                self.send_to_chrome(method='Runtime.enable')
            self.send_to_chrome(method='ServiceWorker.enable')
            self.send_to_chrome(
                method='ServiceWorker.setForceUpdateOnPageLoad')

            # disable google analytics and amp analytics
            self.send_to_chrome(
                method='Network.setBlockedURLs',
                params={
                    'urls': [
                        '*google-analytics.com/analytics.js*',
                        '*google-analytics.com/ga.js*',
                        '*google-analytics.com/ga_exp.js*',
                        '*google-analytics.com/urchin.js*',
                        '*google-analytics.com/collect*',
                        '*google-analytics.com/r/collect*',
                        '*google-analytics.com/__utm.gif*',
                        '*google-analytics.com/gtm/js?*',
                        '*google-analytics.com/cx/api.js*',
                        '*cdn.ampproject.org/*/amp-analytics*.js'
                    ]
                })

    def stop(self):
        '''
        Stops chrome if it's running.

        Best effort: any problem is logged and swallowed, never raised.
        '''
        try:
            if (self.websock and self.websock.sock
                    and self.websock.sock.connected):
                self.logger.info('shutting down websocket connection')
                try:
                    self.websock.close()
                except BaseException as e:
                    self.logger.error(
                        'exception closing websocket %s - %s',
                        self.websock, e)

            self.chrome.stop()

            # never join ourselves, in case stop() is called from the
            # receiver thread
            if self.websock_thread and (
                    self.websock_thread != threading.current_thread()):
                self.websock_thread.join(timeout=30)
                if self.websock_thread.is_alive():
                    self.logger.error(
                        '%s still alive 30 seconds after closing %s, will '
                        'forcefully nudge it again', self.websock_thread,
                        self.websock)
                    self.websock.keep_running = False
                    self.websock_thread.join(timeout=30)
                    if self.websock_thread.is_alive():
                        self.logger.critical(
                            '%s still alive 60 seconds after closing %s',
                            self.websock_thread, self.websock)

            self.websock_url = None
        except BaseException:
            # deliberately broad: stop() must not raise
            self.logger.error('problem stopping', exc_info=True)

    def is_running(self):
        return self.websock_url is not None

    def browse_page(
            self, page_url, extra_headers=None,
            user_agent=None, behavior_parameters=None, behaviors_dir=None,
            on_request=None, on_response=None,
            on_service_worker_version_updated=None, on_screenshot=None,
            username=None, password=None, hashtags=None,
            screenshot_full_page=False, skip_extract_outlinks=False,
            skip_visit_hashtags=False, skip_youtube_dl=False,
            simpler404=False, page_timeout=300, behavior_timeout=900,
            extract_outlinks_timeout=60, download_throughput=-1,
            stealth=False):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, runs behaviors, takes a
        screenshot, extracts outlinks.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the
                browser to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: Directory containing behaviors.yaml and JS
                templates (default None loads Brozzler default JS
                behaviors)
            on_request: callback to invoke on every
                Network.requestWillBeSent event, takes one argument, the
                json-decoded message (default None)
            on_response: callback to invoke on every
                Network.responseReceived event, takes one argument, the
                json-decoded message (default None)
            on_service_worker_version_updated: callback to invoke on every
                ServiceWorker.workerVersionUpdated event, takes one
                argument, the json-decoded message (default None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the raw jpeg bytes (default None)
                # XXX takes two arguments, the url of the page at the time
                # the screenshot was taken, and the raw jpeg bytes
                # (default None)
            username: username string to use to try logging in if a login
                form is found in the page (default None)
            password: password string to use to try logging in if a login
                form is found in the page (default None); login is only
                attempted when this is truthy
            hashtags: extra hashtags to visit, in addition to those found
                among the outlinks (default None)
            screenshot_full_page: passed to screenshot() to capture the
                full page content (default False)
            skip_extract_outlinks: if true, return an empty outlink list
                instead of extracting outlinks (default False)
            skip_visit_hashtags: if true, do not visit hashtags
                (default False)
            skip_youtube_dl: not used by this method
            simpler404: if true, skip behaviors, outlink extraction and
                hashtag visits when the page status is unknown or >= 400
                (default False)
            page_timeout: seconds to wait for a page load event
                (default 300)
            behavior_timeout: seconds to let the behavior run; behaviors
                are skipped if not > 0 (default 900)
            extract_outlinks_timeout: seconds to wait for outlink
                extraction (default 60)
            download_throughput: passed to configure_browser(), bytes per
                second, or -1 to disable (default -1)
            stealth: passed to configure_browser() (default False)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the
                original page url if the page redirects, javascript has
                changed the url in the location bar, etc
            outlinks: a collection of navigational links extracted from
                the page

        Raises:
            brozzler.ProxyError: in case of proxy connection error
            BrowsingException: if browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response
        if on_service_worker_version_updated:
            self.websock_thread.on_service_worker_version_updated = \
                on_service_worker_version_updated
        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(
                    extra_headers=extra_headers,
                    user_agent=user_agent,
                    download_throughput=download_throughput,
                    stealth=stealth)
                self.navigate_to_page(page_url, timeout=page_timeout)
                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    if page_url != self.url().split('#')[0]:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)
                # If the target page HTTP status is 4xx/5xx, there is no
                # point in running behaviors, outlink and hashtag
                # extraction as we didn't get a valid page. Screenshot
                # should run because it may be useful to have a picture of
                # the error page.
                # This is only enabled with option `simpler404`.
                run_behaviors = True
                if simpler404 and (
                        self.websock_thread.page_status is None
                        or self.websock_thread.page_status >= 400):
                    run_behaviors = False

                if run_behaviors and behavior_timeout > 0:
                    behavior_script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                    self.run_behavior(
                        behavior_script, timeout=behavior_timeout)
                final_page_url = self.url()
                if on_screenshot:
                    self._try_screenshot(on_screenshot, screenshot_full_page)
                if not run_behaviors or skip_extract_outlinks:
                    outlinks = []
                else:
                    outlinks = self.extract_outlinks(
                        timeout=extract_outlinks_timeout)
                if run_behaviors and not skip_visit_hashtags:
                    self.visit_hashtags(final_page_url, hashtags, outlinks)
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(e) from e
        finally:
            self.is_browsing = False
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None
            # also drop this callback so it cannot fire for the next page
            # NOTE(review): assumes the receiver thread treats None as "no
            # callback", as it does for on_request/on_response - confirm
            self.websock_thread.on_service_worker_version_updated = None

    def _try_screenshot(self, on_screenshot, full_page=False):
        '''
        Takes a screenshot and hands it to on_screenshot, making up to 3
        attempts if screenshot() times out. Gives up silently (after
        logging) if all attempts time out.

        The browser instance must be scrolled to the top of the page
        before trying to get a screenshot.
        '''
        self.send_to_chrome(
            method='Runtime.evaluate', suppress_logging=True,
            params={'expression': 'window.scroll(0,0)'})
        for i in range(3):
            try:
                jpeg_bytes = self.screenshot(full_page)
                on_screenshot(jpeg_bytes)
                return
            except BrowsingTimeout as e:
                self.logger.error('attempt %s/3: %s', i + 1, e)

    def visit_hashtags(self, page_url, hashtags, outlinks):
        '''
        Navigates to page_url with each hashtag in turn.

        The set of hashtags is `hashtags` plus the fragment of every
        outlink that, without its fragment, equals page_url.
        '''
        _hashtags = set(hashtags or [])
        for outlink in outlinks:
            url = urlcanon.whatwg(outlink)
            hashtag = (url.hash_sign + url.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url)
            if hashtag and str(url) == page_url:
                _hashtags.add(hashtag)
        # could inject a script that listens for HashChangeEvent to figure
        # out which hashtags were visited already and skip those
        for hashtag in _hashtags:
            # navigate_to_hashtag (nothing to wait for so no timeout?)
            self.logger.debug('navigating to hashtag %s', hashtag)
            url = urlcanon.whatwg(page_url)
            url.hash_sign = b'#'
            url.fragment = hashtag[1:].encode('utf-8')
            self.send_to_chrome(
                method='Page.navigate', params={'url': str(url)})
            time.sleep(5)  # um.. wait for idleness or something?
            # take another screenshot?
            # run behavior again with short timeout?
            # retrieve outlinks again and append to list?

    def configure_browser(self, extra_headers=None, user_agent=None,
                          download_throughput=-1, stealth=False):
        '''
        Sends per-page settings to chrome.

        Args:
            extra_headers: dict of extra http headers to send with every
                request (default None); the dict is copied, the caller's
                object is never modified
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            download_throughput: bytes/second for network emulation, or -1
                to disable (default)
            stealth: if true, install the 'stealth.js' template as a
                script to evaluate on every new document (default False)
        '''
        # Copy before adding our own header so the caller's dict is left
        # alone.
        headers = dict(extra_headers or {})
        headers['Accept-Encoding'] = 'gzip'  # avoid encodings br, sdch
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
            method='Network.setExtraHTTPHeaders',
            params={'headers': headers})
        self._wait_for(
            lambda: self.websock_thread.received_result(msg_id),
            timeout=10)
        if user_agent:
            self.send_to_chrome(
                method='Network.setUserAgentOverride',
                params={'userAgent': user_agent})
        if download_throughput > -1:
            # traffic shaping already used by SPN2 to aid warcprox
            # resilience; parameter value as bytes/second, or -1 to
            # disable (default)
            self.send_to_chrome(
                method='Network.emulateNetworkConditions',
                params={'downloadThroughput': download_throughput})
        if stealth:
            self.websock_thread.expect_result(self._command_id.peek())
            js = brozzler.jinja2_environment().get_template(
                'stealth.js').render()
            stealth_msg_id = self.send_to_chrome(
                method='Page.addScriptToEvaluateOnNewDocument',
                params={'source': js})
            self._wait_for(
                lambda: self.websock_thread.received_result(stealth_msg_id),
                timeout=10)

    def navigate_to_page(self, page_url, timeout=300):
        '''
        Sends Page.navigate for page_url and waits up to `timeout` seconds
        for the websocket thread to flag a page load event.
        '''
        self.logger.info('navigating to page %s', page_url)
        # reset state left over from any earlier navigation
        self.websock_thread.got_page_load_event = None
        self.websock_thread.page_status = None
        self.send_to_chrome(method='Page.navigate', params={'url': page_url})
        self._wait_for(
            lambda: self.websock_thread.got_page_load_event,
            timeout=timeout)

    def extract_outlinks(self, timeout=60):
        '''
        Evaluates the 'extract-outlinks.js' template in the page.

        Returns:
            frozenset of the links returned by the script, each passed
            through urlcanon.whatwg; links that raise AddressValueError
            are skipped with a warning. Empty frozenset if no links were
            found or the result message does not have the expected
            structure (the latter is logged as an error).
        '''
        self.logger.info('extracting outlinks')
        self.websock_thread.expect_result(self._command_id.peek())
        js = brozzler.jinja2_environment().get_template(
            'extract-outlinks.js').render()
        msg_id = self.send_to_chrome(
            method='Runtime.evaluate', params={'expression': js})
        self._wait_for(
            lambda: self.websock_thread.received_result(msg_id),
            timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        if ('result' in message and 'result' in message['result']
                and 'value' in message['result']['result']):
            value = message['result']['result']['value']
            if value:
                out = []
                for link in value.split('\n'):
                    try:
                        out.append(str(urlcanon.whatwg(link)))
                    except AddressValueError:
                        self.logger.warning(
                            'skip invalid outlink: %s', link)
                return frozenset(out)
            # no links found
            return frozenset()
        self.logger.error(
            'problem extracting outlinks, result message: %s', message)
        return frozenset()

    def screenshot(self, full_page=False, timeout=45):
        '''
        Captures a jpeg screenshot and returns the decoded bytes.

        Optionally capture full page screenshot using puppeteer as an
        inspiration:
        https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898

        Args:
            full_page: if true, size the capture to the content size
                reported by Page.getLayoutMetrics (default False)
            timeout: seconds to wait for each chrome response (default 45)
        '''
        self.logger.info('taking screenshot')
        if full_page:
            self.websock_thread.expect_result(self._command_id.peek())
            metrics_msg_id = self.send_to_chrome(
                method='Page.getLayoutMetrics')
            self._wait_for(
                lambda: self.websock_thread.received_result(metrics_msg_id),
                timeout=timeout)
            message = self.websock_thread.pop_result(metrics_msg_id)
            width = message['result']['contentSize']['width']
            height = message['result']['contentSize']['height']
            clip = dict(x=0, y=0, width=width, height=height, scale=1)
            deviceScaleFactor = 1
            screenOrientation = {'angle': 0, 'type': 'portraitPrimary'}
            self.send_to_chrome(
                method='Emulation.setDeviceMetricsOverride',
                params=dict(
                    mobile=False, width=width, height=height,
                    deviceScaleFactor=deviceScaleFactor,
                    screenOrientation=screenOrientation))
            capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip}
        else:
            capture_params = {'format': 'jpeg', 'quality': 95}
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
            method='Page.captureScreenshot', params=capture_params)
        self._wait_for(
            lambda: self.websock_thread.received_result(msg_id),
            timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        jpeg_bytes = base64.b64decode(message['result']['data'])
        return jpeg_bytes

    def url(self, timeout=30):
        '''
        Returns value of document.URL from the browser.
        '''
        self.websock_thread.expect_result(self._command_id.peek())
        msg_id = self.send_to_chrome(
            method='Runtime.evaluate',
            params={'expression': 'document.URL'})
        self._wait_for(
            lambda: self.websock_thread.received_result(msg_id),
            timeout=timeout)
        message = self.websock_thread.pop_result(msg_id)
        return message['result']['result']['value']

    def run_behavior(self, behavior_script, timeout=900):
        '''
        Starts behavior_script in the page, then polls
        umbraBehaviorFinished() until it evaluates to boolean true or
        `timeout` seconds have elapsed.

        A poll that times out, throws in the page, or returns anything
        other than boolean true is treated as "not finished yet".
        '''
        self.send_to_chrome(
            method='Runtime.evaluate', suppress_logging=True,
            params={'expression': behavior_script})

        # never sleep longer than the overall timeout between polls
        check_interval = min(timeout, 7)
        start = time.time()
        while True:
            elapsed = time.time() - start
            if elapsed > timeout:
                self.logger.info(
                    'behavior reached hard timeout after %.1fs', elapsed)
                return

            brozzler.sleep(check_interval)

            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                method='Runtime.evaluate', suppress_logging=True,
                params={'expression': 'umbraBehaviorFinished()'})
            try:
                self._wait_for(
                    lambda: self.websock_thread.received_result(msg_id),
                    timeout=5)
                msg = self.websock_thread.pop_result(msg_id)
                # The evaluation result may lack a 'value' key (e.g. when
                # the expression yields undefined), hence .get(); `is
                # True` accepts only a genuine boolean true.
                if (msg and 'result' in msg
                        and 'exceptionDetails' not in msg['result']
                        and not msg['result'].get('wasThrown')
                        and 'result' in msg['result']
                        and msg['result']['result'].get('value') is True):
                    self.logger.info('behavior decided it has finished')
                    return
            except BrowsingTimeout:
                pass

    def try_login(self, username, password, timeout=300):
        '''
        Runs the 'try-login.js.j2' template in the page and, if it reports
        that a login form was (or may have been) submitted, waits up to
        `timeout` seconds for another page load event.

        Raises:
            BrowsingException: if the login state could not be determined
                within 30 seconds
        '''
        try_login_js = brozzler.jinja2_environment().get_template(
            'try-login.js.j2').render(username=username, password=password)

        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(
            method='Runtime.evaluate', suppress_logging=True,
            params={'expression': try_login_js})

        # wait for tryLogin to finish trying (should be very very quick)
        start = time.time()
        while True:
            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                method='Runtime.evaluate',
                params={
                    'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'
                })
            try:
                self._wait_for(
                    lambda: self.websock_thread.received_result(msg_id),
                    timeout=5)
                msg = self.websock_thread.pop_result(msg_id)
                if (msg and 'result' in msg
                        and 'result' in msg['result']):
                    # 'value' is absent while the state variable is still
                    # undefined; treat that like any other unknown state
                    result = msg['result']['result'].get('value')
                    if result == 'login-form-not-found':
                        # we're done
                        return
                    elif result in ('submitted-form', 'maybe-submitted-form'):
                        # wait for page load event below
                        self.logger.info(
                            'submitted a login form, waiting for another '
                            'page load event')
                        break
                    # else try again to get __brzl_tryLoginState

            except BrowsingTimeout:
                pass

            if time.time() - start > 30:
                raise BrowsingException(
                    'timed out trying to check if tryLogin finished')

        # if we get here, we submitted a form, now we wait for another page
        # load event
        self._wait_for(
            lambda: self.websock_thread.got_page_load_event,
            timeout=timeout)
class Browser:
    '''
    Manages an instance of Chrome for browsing pages.

    All interaction with the running browser is delegated to a
    BrowserController connected to chrome's websocket url.
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    def __init__(self, **kwargs):
        '''
        Initializes the Browser.

        Args:
            **kwargs: arguments for Chrome(...)
        '''
        self.chrome = Chrome(**kwargs)
        self.websocket_url = None
        self.is_browsing = False
        self._browser_controller = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self, **kwargs):
        '''
        Starts chrome if it's not running.

        Args:
            **kwargs: arguments for self.chrome.start(...)
        '''
        if not self.is_running():
            self.websocket_url = self.chrome.start(**kwargs)
            self._browser_controller = BrowserController(self.websocket_url)
            self._browser_controller.start()

    def stop(self):
        '''
        Stops chrome if it's running.

        Best effort: any problem is logged and swallowed, never raised.
        Chrome is stopped and the browser is marked as not running even if
        stopping the browser controller fails.
        '''
        try:
            try:
                if self._browser_controller:
                    self._browser_controller.stop()
            finally:
                # must happen even when the controller failed to stop,
                # otherwise chrome keeps running and is_running() keeps
                # returning True
                self._browser_controller = None
                self.websocket_url = None
                self.chrome.stop()
        except BaseException:
            # deliberately broad: stop() must not raise
            self.logger.error('problem stopping', exc_info=True)

    def is_running(self):
        return self.websocket_url is not None

    def browse_page(
            self, page_url, ignore_cert_errors=False, extra_headers=None,
            user_agent=None, behavior_parameters=None,
            on_request=None, on_response=None, on_screenshot=None):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, runs behaviors, takes a
        screenshot, extracts outlinks.

        NOTE(review): ignore_cert_errors, extra_headers, user_agent,
        on_request and on_response are accepted but not used anywhere in
        this method body.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the
                browser to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            on_request: callback to invoke on every
                Network.requestWillBeSent event, takes one argument, the
                json-decoded message (default None)
            on_response: callback to invoke on every
                Network.responseReceived event, takes one argument, the
                json-decoded message (default None)
            on_screenshot: callback to invoke when screenshot is obtained,
                takes one argument, the raw jpeg bytes (default None)
                # XXX takes two arguments, the url of the page at the time
                # the screenshot was taken, and the raw jpeg bytes
                # (default None)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the
                original page url if the page redirects, javascript has
                changed the url in the location bar, etc
            outlinks: the navigational links extracted from the page, as
                returned by the browser controller

        Raises:
            BrowsingException: if browsing the page fails
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        try:
            self._browser_controller.navigate_to_page(page_url, timeout=300)
            ## if login_credentials:
            ##     self._browser_controller.try_login(login_credentials) (5 min?)
            behavior_script = brozzler.behavior_script(
                page_url, behavior_parameters)
            self._browser_controller.run_behavior(behavior_script, timeout=900)
            if on_screenshot:
                self._browser_controller.scroll_to_top()
                jpeg_bytes = self._browser_controller.screenshot()
                on_screenshot(jpeg_bytes)
            outlinks = self._browser_controller.extract_outlinks()
            ## for each hashtag not already visited:
            ##     navigate_to_hashtag (nothing to wait for so no timeout?)
            ##     if on_screenshot;
            ##         take screenshot (30 sec)
            ##     run behavior (3 min)
            ##     outlinks += retrieve_outlinks (60 sec)
            final_page_url = self._browser_controller.url()
            return final_page_url, outlinks
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(e) from e
        finally:
            self.is_browsing = False