def delay_context_exit():
    # monkey-patch __exit__ to sleep before tearing down the gate, so an
    # exception raised while the with block is exiting can still be delivered
    gate = brozzler.thread_accept_exceptions()
    orig_exit = type(gate).__exit__
    try:
        type(gate).__exit__ = lambda self, et, ev, t: (
                brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1]
        with brozzler.thread_accept_exceptions() as gate:
            brozzler.sleep(2)
    except Exception as e:
        nonlocal thread_caught_exception
        thread_caught_exception = e
    finally:
        type(gate).__exit__ = orig_exit
def two_with_blocks():
    try:
        with brozzler.thread_accept_exceptions():
            time.sleep(2)
        return  # test fails
    except Exception1 as e:
        # Exception1 is a custom exception class defined in the enclosing
        # test module
        pass
    except:
        return  # fail test

    try:
        with brozzler.thread_accept_exceptions():
            brozzler.sleep(2)
    except Exception as e:
        nonlocal thread_caught_exception
        thread_caught_exception = e
def accept_immediately():
    try:
        with brozzler.thread_accept_exceptions():
            brozzler.sleep(2)
    except Exception as e:
        nonlocal thread_caught_exception
        thread_caught_exception = e
def accept_eventually():
    try:
        brozzler.sleep(2)
        with brozzler.thread_accept_exceptions():
            pass
    except Exception as e:
        nonlocal thread_caught_exception
        thread_caught_exception = e
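These helpers are nested inside a test function and report back through the nonlocal thread_caught_exception variable. Below is a minimal sketch of how such a test might drive them, assuming brozzler.thread_raise(thread, exc_type) and brozzler.sleep() from brozzler/__init__.py; the test function name and the Exception1 stand-in class here are illustrative, not the project's actual test code.

import threading

import brozzler


class Exception1(Exception):
    # stand-in for the custom exception class the tests raise into the thread
    pass


def test_thread_raise_accept_immediately():
    thread_caught_exception = None

    def accept_immediately():
        nonlocal thread_caught_exception
        try:
            with brozzler.thread_accept_exceptions():
                brozzler.sleep(2)
        except Exception as e:
            thread_caught_exception = e

    th = threading.Thread(target=accept_immediately)
    th.start()
    # deliver Exception1 into the worker thread; if the thread is not yet
    # accepting exceptions, brozzler keeps it pending until it is
    brozzler.thread_raise(th, Exception1)
    th.join()
    assert isinstance(thread_caught_exception, Exception1)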
def _try_youtube_dl(self, ydl, site, page):
    try:
        self.logger.info("trying youtube-dl on {}".format(page))
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        self._remember_videos(page, ydl.brozzler_spy)
        # logging.info('XXX %s', json.dumps(info))
        if self._using_warcprox(site):
            info_json = json.dumps(info, sort_keys=True, indent=4)
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
    except brozzler.ShutdownRequested as e:
        raise
    except BaseException as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            pass
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and self._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying yt-dlp on %s", page)
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(
                    ydl.extract_info(str(urlcanon.whatwg(page.url))))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with yt-dlp json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers(page))
        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'yt-dlp hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying youtube-dl on %s", page)
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
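All three variants above inspect e.exc_info rather than the exception type itself because youtube-dl/yt-dlp wrap extractor failures in DownloadError, which records the underlying exception via sys.exc_info(). A minimal, hedged sketch of that unwrapping outside of brozzler, assuming yt-dlp's public YoutubeDL API; the function name is illustrative and the outcome depends on which extractor handles the URL.

import urllib.error

import yt_dlp


def classify_ydl_failure(url):
    # returns a short label for what went wrong, mirroring the branching above
    try:
        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            ydl.extract_info(url)
        return "ok"
    except Exception as e:
        exc_info = getattr(e, "exc_info", None)
        cause = exc_info[0] if exc_info else None
        if cause is yt_dlp.utils.UnsupportedError:
            return "unsupported url"
        elif cause is urllib.error.URLError:
            return "network or proxy problem"
        raise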
def browse_page(
        self, page_url, extra_headers=None,
        user_agent=None, behavior_parameters=None, behaviors_dir=None,
        on_request=None, on_response=None, on_screenshot=None,
        username=None, password=None, hashtags=None,
        skip_extract_outlinks=False, skip_visit_hashtags=False,
        skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
    '''
    Browses page in browser.

    Browser should already be running, i.e. start() should have been
    called. Opens the page_url in the browser, runs behaviors, takes a
    screenshot, extracts outlinks.

    Args:
        page_url: url of the page to browse
        extra_headers: dict of extra http headers to configure the
            browser to send with every request (default None)
        user_agent: user agent string, replaces browser default if
            supplied (default None)
        behavior_parameters: dict of parameters for populating the
            javascript behavior template (default None)
        behaviors_dir: Directory containing behaviors.yaml and JS
            templates (default None loads Brozzler default JS behaviors)
        on_request: callback to invoke on every Network.requestWillBeSent
            event, takes one argument, the json-decoded message
            (default None)
        on_response: callback to invoke on every Network.responseReceived
            event, takes one argument, the json-decoded message
            (default None)
        on_screenshot: callback to invoke when screenshot is obtained,
            takes one argument, the raw jpeg bytes (default None)
            # XXX takes two arguments, the url of the page at the time the
            # screenshot was taken, and the raw jpeg bytes (default None)

    Returns:
        A tuple (final_page_url, outlinks).
        final_page_url: the url in the location bar at the end of the
            browse_page cycle, which could be different from the original
            page url if the page redirects, javascript has changed the url
            in the location bar, etc
        outlinks: a list of navigational links extracted from the page

    Raises:
        brozzler.ProxyError: in case of proxy connection error
        BrowsingException: if browsing the page fails in some other way
    '''
    if not self.is_running():
        raise BrowsingException('browser has not been started')
    if self.is_browsing:
        raise BrowsingException('browser is already busy browsing a page')
    self.is_browsing = True
    if on_request:
        self.websock_thread.on_request = on_request
    if on_response:
        self.websock_thread.on_response = on_response
    try:
        with brozzler.thread_accept_exceptions():
            self.configure_browser(
                    extra_headers=extra_headers, user_agent=user_agent)
            self.navigate_to_page(page_url, timeout=page_timeout)
            if password:
                self.try_login(username, password, timeout=page_timeout)
                # if login redirected us, return to page_url
                if page_url != self.url().split('#')[0]:
                    self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                    self.navigate_to_page(page_url, timeout=page_timeout)
            if on_screenshot:
                self._try_screenshot(on_screenshot)
            behavior_script = brozzler.behavior_script(
                    page_url, behavior_parameters,
                    behaviors_dir=behaviors_dir)
            self.run_behavior(behavior_script, timeout=behavior_timeout)
            if skip_extract_outlinks:
                outlinks = []
            else:
                outlinks = self.extract_outlinks()
            if not skip_visit_hashtags:
                self.visit_hashtags(self.url(), hashtags, outlinks)
            final_page_url = self.url()
            return final_page_url, outlinks
    except brozzler.ReachedLimit:
        # websock_thread has stashed the ReachedLimit exception with
        # more information, raise that one
        raise self.websock_thread.reached_limit
    except websocket.WebSocketConnectionClosedException as e:
        self.logger.error('websocket closed, did chrome die?')
        raise BrowsingException(e)
    finally:
        self.is_browsing = False
        self.websock_thread.on_request = None
        self.websock_thread.on_response = None
def browse_page(
        self, page_url, extra_headers=None,
        user_agent=None, behavior_parameters=None, behaviors_dir=None,
        on_request=None, on_response=None,
        on_service_worker_version_updated=None, on_screenshot=None,
        username=None, password=None, hashtags=None,
        skip_extract_outlinks=False, skip_visit_hashtags=False,
        skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
    '''
    Browses page in browser.

    Browser should already be running, i.e. start() should have been
    called. Opens the page_url in the browser, runs behaviors, takes a
    screenshot, extracts outlinks.

    Args:
        page_url: url of the page to browse
        extra_headers: dict of extra http headers to configure the
            browser to send with every request (default None)
        user_agent: user agent string, replaces browser default if
            supplied (default None)
        behavior_parameters: dict of parameters for populating the
            javascript behavior template (default None)
        behaviors_dir: Directory containing behaviors.yaml and JS
            templates (default None loads Brozzler default JS behaviors)
        on_request: callback to invoke on every Network.requestWillBeSent
            event, takes one argument, the json-decoded message
            (default None)
        on_response: callback to invoke on every Network.responseReceived
            event, takes one argument, the json-decoded message
            (default None)
        on_service_worker_version_updated: callback to invoke on every
            ServiceWorker.workerVersionUpdated event, takes one argument,
            the json-decoded message (default None)
        on_screenshot: callback to invoke when screenshot is obtained,
            takes one argument, the raw jpeg bytes (default None)
            # XXX takes two arguments, the url of the page at the time the
            # screenshot was taken, and the raw jpeg bytes (default None)
        username: username string to use to try logging in if a login
            form is found in the page (default None)
        password: password string to use to try logging in if a login
            form is found in the page (default None)
        ... (there are more)

    Returns:
        A tuple (final_page_url, outlinks).
        final_page_url: the url in the location bar at the end of the
            browse_page cycle, which could be different from the original
            page url if the page redirects, javascript has changed the url
            in the location bar, etc
        outlinks: a list of navigational links extracted from the page

    Raises:
        brozzler.ProxyError: in case of proxy connection error
        BrowsingException: if browsing the page fails in some other way
    '''
    if not self.is_running():
        raise BrowsingException('browser has not been started')
    if self.is_browsing:
        raise BrowsingException('browser is already busy browsing a page')
    self.is_browsing = True
    if on_request:
        self.websock_thread.on_request = on_request
    if on_response:
        self.websock_thread.on_response = on_response
    if on_service_worker_version_updated:
        self.websock_thread.on_service_worker_version_updated = \
                on_service_worker_version_updated
    try:
        with brozzler.thread_accept_exceptions():
            self.configure_browser(
                    extra_headers=extra_headers, user_agent=user_agent)
            self.navigate_to_page(page_url, timeout=page_timeout)
            if password:
                self.try_login(username, password, timeout=page_timeout)
                # if login redirected us, return to page_url
                if page_url != self.url().split('#')[0]:
                    self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                    self.navigate_to_page(page_url, timeout=page_timeout)
            if on_screenshot:
                self._try_screenshot(on_screenshot)
            behavior_script = brozzler.behavior_script(
                    page_url, behavior_parameters,
                    behaviors_dir=behaviors_dir)
            self.run_behavior(behavior_script, timeout=behavior_timeout)
            if skip_extract_outlinks:
                outlinks = []
            else:
                outlinks = self.extract_outlinks()
            if not skip_visit_hashtags:
                self.visit_hashtags(self.url(), hashtags, outlinks)
            final_page_url = self.url()
            return final_page_url, outlinks
    except brozzler.ReachedLimit:
        # websock_thread has stashed the ReachedLimit exception with
        # more information, raise that one
        raise self.websock_thread.reached_limit
    except websocket.WebSocketConnectionClosedException as e:
        self.logger.error('websocket closed, did chrome die?')
        raise BrowsingException(e)
    finally:
        self.is_browsing = False
        self.websock_thread.on_request = None
        self.websock_thread.on_response = None
def browse_page(
        self, page_url, extra_headers=None,
        user_agent=None, behavior_parameters=None, behaviors_dir=None,
        on_request=None, on_response=None,
        on_service_worker_version_updated=None, on_screenshot=None,
        username=None, password=None, hashtags=None,
        screenshot_full_page=False, skip_extract_outlinks=False,
        skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
        page_timeout=300, behavior_timeout=900,
        extract_outlinks_timeout=60, download_throughput=-1, stealth=False):
    '''
    Browses page in browser.

    Browser should already be running, i.e. start() should have been
    called. Opens the page_url in the browser, runs behaviors, takes a
    screenshot, extracts outlinks.

    Args:
        page_url: url of the page to browse
        extra_headers: dict of extra http headers to configure the
            browser to send with every request (default None)
        user_agent: user agent string, replaces browser default if
            supplied (default None)
        behavior_parameters: dict of parameters for populating the
            javascript behavior template (default None)
        behaviors_dir: Directory containing behaviors.yaml and JS
            templates (default None loads Brozzler default JS behaviors)
        on_request: callback to invoke on every Network.requestWillBeSent
            event, takes one argument, the json-decoded message
            (default None)
        on_response: callback to invoke on every Network.responseReceived
            event, takes one argument, the json-decoded message
            (default None)
        on_service_worker_version_updated: callback to invoke on every
            ServiceWorker.workerVersionUpdated event, takes one argument,
            the json-decoded message (default None)
        on_screenshot: callback to invoke when screenshot is obtained,
            takes one argument, the raw jpeg bytes (default None)
            # XXX takes two arguments, the url of the page at the time the
            # screenshot was taken, and the raw jpeg bytes (default None)
        username: username string to use to try logging in if a login
            form is found in the page (default None)
        password: password string to use to try logging in if a login
            form is found in the page (default None)
        ... (there are more)

    Returns:
        A tuple (final_page_url, outlinks).
        final_page_url: the url in the location bar at the end of the
            browse_page cycle, which could be different from the original
            page url if the page redirects, javascript has changed the url
            in the location bar, etc
        outlinks: a list of navigational links extracted from the page

    Raises:
        brozzler.ProxyError: in case of proxy connection error
        BrowsingException: if browsing the page fails in some other way
    '''
    if not self.is_running():
        raise BrowsingException('browser has not been started')
    if self.is_browsing:
        raise BrowsingException('browser is already busy browsing a page')
    self.is_browsing = True
    if on_request:
        self.websock_thread.on_request = on_request
    if on_response:
        self.websock_thread.on_response = on_response
    if on_service_worker_version_updated:
        self.websock_thread.on_service_worker_version_updated = \
                on_service_worker_version_updated
    try:
        with brozzler.thread_accept_exceptions():
            self.configure_browser(
                    extra_headers=extra_headers, user_agent=user_agent,
                    download_throughput=download_throughput, stealth=stealth)
            self.navigate_to_page(page_url, timeout=page_timeout)
            if password:
                self.try_login(username, password, timeout=page_timeout)
                # if login redirected us, return to page_url
                if page_url != self.url().split('#')[0]:
                    self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                    self.navigate_to_page(page_url, timeout=page_timeout)
            # If the target page HTTP status is 4xx/5xx, there is no point
            # in running behaviors, outlink and hashtag extraction as we
            # didn't get a valid page. Screenshot should run because it
            # may be useful to have a picture of the error page.
            # This is only enabled with option `simpler404`.
            run_behaviors = True
            if simpler404 and (self.websock_thread.page_status is None
                    or self.websock_thread.page_status >= 400):
                run_behaviors = False
            if run_behaviors and behavior_timeout > 0:
                behavior_script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                self.run_behavior(behavior_script, timeout=behavior_timeout)
            final_page_url = self.url()
            if on_screenshot:
                self._try_screenshot(on_screenshot, screenshot_full_page)
            if not run_behaviors or skip_extract_outlinks:
                outlinks = []
            else:
                outlinks = self.extract_outlinks(
                        timeout=extract_outlinks_timeout)
            if run_behaviors and not skip_visit_hashtags:
                self.visit_hashtags(final_page_url, hashtags, outlinks)
            return final_page_url, outlinks
    except brozzler.ReachedLimit:
        # websock_thread has stashed the ReachedLimit exception with
        # more information, raise that one
        raise self.websock_thread.reached_limit
    except websocket.WebSocketConnectionClosedException as e:
        self.logger.error('websocket closed, did chrome die?')
        raise BrowsingException(e)
    finally:
        self.is_browsing = False
        self.websock_thread.on_request = None
        self.websock_thread.on_response = None
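For context, a minimal usage sketch against this latest browse_page signature, hedged: it assumes brozzler.Browser(**kwargs) forwards keyword arguments such as chrome_exe to the underlying Chrome process wrapper and that start()/stop() manage the browser process, as in brozzler.browser; the executable name and URL below are placeholders.

import brozzler

# assumption: Browser forwards chrome_exe to the Chrome process wrapper
browser = brozzler.Browser(chrome_exe="chromium-browser")
browser.start()
try:
    final_url, outlinks = browser.browse_page(
            "https://example.com/",
            screenshot_full_page=True,
            simpler404=True,      # skip behaviors/outlinks on 4xx/5xx pages
            skip_youtube_dl=True,
            page_timeout=120, behavior_timeout=300)
    print(final_url, len(outlinks))
finally:
    browser.stop()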