def browse_page(
        self, page_url, extra_headers=None,
        user_agent=None, behavior_parameters=None, behaviors_dir=None,
        on_request=None, on_response=None,
        on_screenshot=None, username=None, password=None, hashtags=None,
        skip_extract_outlinks=False, skip_visit_hashtags=False,
        skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
    '''
    Browses page in browser.

    Browser should already be running, i.e. start() should have been
    called. Opens the page_url in the browser, runs behaviors, takes a
    screenshot, extracts outlinks.

    Args:
        page_url: url of the page to browse
        extra_headers: dict of extra http headers to configure the browser
            to send with every request (default None)
        user_agent: user agent string, replaces browser default if
            supplied (default None)
        behavior_parameters: dict of parameters for populating the
            javascript behavior template (default None)
        behaviors_dir: Directory containing behaviors.yaml and JS templates
            (default None loads Brozzler default JS behaviors)
        on_request: callback to invoke on every Network.requestWillBeSent
            event, takes one argument, the json-decoded message (default
            None)
        on_response: callback to invoke on every Network.responseReceived
            event, takes one argument, the json-decoded message (default
            None)
        on_screenshot: callback to invoke when screenshot is obtained,
            takes one argument, the raw jpeg bytes (default None)
            # XXX takes two arguments, the url of the page at the time the
            # screenshot was taken, and the raw jpeg bytes (default None)
        username: passed to try_login() together with password (default
            None)
        password: a login attempt is made only when this is truthy
            (default None)
        hashtags: passed through to visit_hashtags() (default None)
        skip_extract_outlinks: if true, outlinks are not extracted and an
            empty list is used instead (default False)
        skip_visit_hashtags: if true, visit_hashtags() is not called
            (default False)
        skip_youtube_dl: not referenced by this method (default False)
        page_timeout: timeout passed to navigate_to_page() and try_login()
            (default 300)
        behavior_timeout: timeout passed to run_behavior() (default 900)

    Returns:
        A tuple (final_page_url, outlinks).
        final_page_url: the url in the location bar at the end of the
            browse_page cycle, which could be different from the original
            page url if the page redirects, javascript has changed the url
            in the location bar, etc
        outlinks: a list of navigational links extracted from the page

    Raises:
        brozzler.ProxyError: in case of proxy connection error
        brozzler.ReachedLimit: the instance stashed on websock_thread is
            raised in place of the one caught here
        BrowsingException: if browsing the page fails in some other way
    '''
    if not self.is_running():
        raise BrowsingException('browser has not been started')
    if self.is_browsing:
        raise BrowsingException('browser is already busy browsing a page')
    self.is_browsing = True
    if on_request:
        self.websock_thread.on_request = on_request
    if on_response:
        self.websock_thread.on_response = on_response
    try:
        with brozzler.thread_accept_exceptions():
            self.configure_browser(
                    extra_headers=extra_headers,
                    user_agent=user_agent)
            self.navigate_to_page(page_url, timeout=page_timeout)
            if password:
                self.try_login(username, password, timeout=page_timeout)
                # if login redirected us, return to page_url
                if page_url != self.url().split('#')[0]:
                    self.logger.debug(
                        'login navigated away from %s; returning!',
                        page_url)
                    self.navigate_to_page(page_url, timeout=page_timeout)
            if on_screenshot:
                self._try_screenshot(on_screenshot)
            behavior_script = brozzler.behavior_script(
                    page_url, behavior_parameters,
                    behaviors_dir=behaviors_dir)
            self.run_behavior(behavior_script, timeout=behavior_timeout)
            if skip_extract_outlinks:
                outlinks = []
            else:
                outlinks = self.extract_outlinks()
            if not skip_visit_hashtags:
                self.visit_hashtags(self.url(), hashtags, outlinks)
            final_page_url = self.url()
            return final_page_url, outlinks
    except brozzler.ReachedLimit:
        # websock_thread has stashed the ReachedLimit exception with
        # more information, raise that one
        raise self.websock_thread.reached_limit
    except websocket.WebSocketConnectionClosedException as e:
        self.logger.error('websocket closed, did chrome die?')
        # chain explicitly so the websocket error is recorded as the cause
        raise BrowsingException(e) from e
    finally:
        # always release the busy flag and drop the per-call callbacks,
        # whether browsing succeeded or failed
        self.is_browsing = False
        self.websock_thread.on_request = None
        self.websock_thread.on_response = None
def browse_page(self, page_url, extra_headers=None,
                user_agent=None, behavior_parameters=None, behaviors_dir=None,
                on_request=None, on_response=None,
                on_service_worker_version_updated=None, on_screenshot=None,
                username=None, password=None, hashtags=None,
                screenshot_full_page=False, skip_extract_outlinks=False,
                skip_visit_hashtags=False, skip_youtube_dl=False,
                simpler404=False, page_timeout=300, behavior_timeout=900,
                extract_outlinks_timeout=60, download_throughput=-1,
                stealth=False):
    '''
    Browses page in browser.

    Browser should already be running, i.e. start() should have been
    called. Opens the page_url in the browser, runs behaviors, takes a
    screenshot, extracts outlinks.

    Args:
        page_url: url of the page to browse
        extra_headers: dict of extra http headers to configure the browser
            to send with every request (default None)
        user_agent: user agent string, replaces browser default if
            supplied (default None)
        behavior_parameters: dict of parameters for populating the
            javascript behavior template (default None)
        behaviors_dir: Directory containing behaviors.yaml and JS templates
            (default None loads Brozzler default JS behaviors)
        on_request: callback to invoke on every Network.requestWillBeSent
            event, takes one argument, the json-decoded message (default
            None)
        on_response: callback to invoke on every Network.responseReceived
            event, takes one argument, the json-decoded message (default
            None)
        on_service_worker_version_updated: callback to invoke on every
            ServiceWorker.workerVersionUpdated event, takes one argument,
            the json-decoded message (default None)
        on_screenshot: callback to invoke when screenshot is obtained,
            takes one argument, the raw jpeg bytes (default None)
            # XXX takes two arguments, the url of the page at the time the
            # screenshot was taken, and the raw jpeg bytes (default None)
        username: username string to use to try logging in if a login form
            is found in the page (default None)
        password: password string to use to try logging in if a login form
            is found in the page; a login attempt is made only when this
            is truthy (default None)
        hashtags: passed through to visit_hashtags() (default None)
        screenshot_full_page: passed through to _try_screenshot()
            (default False)
        skip_extract_outlinks: if true, outlinks are not extracted and an
            empty list is used instead (default False)
        skip_visit_hashtags: if true, visit_hashtags() is not called
            (default False)
        skip_youtube_dl: not referenced by this method (default False)
        simpler404: if true, and the page status recorded by
            websock_thread is None or >= 400, behaviors, outlink
            extraction and hashtag visits are skipped; the screenshot is
            still attempted (default False)
        page_timeout: timeout passed to navigate_to_page() and try_login()
            (default 300)
        behavior_timeout: timeout passed to run_behavior(); behaviors are
            skipped when this is not greater than 0 (default 900)
        extract_outlinks_timeout: timeout passed to extract_outlinks()
            (default 60)
        download_throughput: passed through to configure_browser()
            (default -1)
        stealth: passed through to configure_browser() (default False)

    Returns:
        A tuple (final_page_url, outlinks).
        final_page_url: the url in the location bar at the end of the
            browse_page cycle, which could be different from the original
            page url if the page redirects, javascript has changed the url
            in the location bar, etc
        outlinks: a list of navigational links extracted from the page

    Raises:
        brozzler.ProxyError: in case of proxy connection error
        brozzler.ReachedLimit: the instance stashed on websock_thread is
            raised in place of the one caught here
        BrowsingException: if browsing the page fails in some other way
    '''
    if not self.is_running():
        raise BrowsingException('browser has not been started')
    if self.is_browsing:
        raise BrowsingException('browser is already busy browsing a page')
    self.is_browsing = True
    if on_request:
        self.websock_thread.on_request = on_request
    if on_response:
        self.websock_thread.on_response = on_response
    if on_service_worker_version_updated:
        self.websock_thread.on_service_worker_version_updated = (
                on_service_worker_version_updated)
    try:
        with brozzler.thread_accept_exceptions():
            self.configure_browser(
                    extra_headers=extra_headers,
                    user_agent=user_agent,
                    download_throughput=download_throughput,
                    stealth=stealth)
            self.navigate_to_page(page_url, timeout=page_timeout)
            if password:
                self.try_login(username, password, timeout=page_timeout)
                # if login redirected us, return to page_url
                if page_url != self.url().split('#')[0]:
                    self.logger.debug(
                        'login navigated away from %s; returning!',
                        page_url)
                    self.navigate_to_page(page_url, timeout=page_timeout)
            # If the target page HTTP status is 4xx/5xx, there is no point
            # in running behaviors, outlink and hashtag extraction as we
            # didn't get a valid page. Screenshot should run because it
            # may be useful to have a picture of the error page.
            # This is only enabled with option `simpler404`.
            run_behaviors = True
            if simpler404 and (
                    self.websock_thread.page_status is None
                    or self.websock_thread.page_status >= 400):
                run_behaviors = False

            if run_behaviors and behavior_timeout > 0:
                behavior_script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                self.run_behavior(behavior_script, timeout=behavior_timeout)
            # the url is captured before the screenshot and hashtag visits
            final_page_url = self.url()
            if on_screenshot:
                self._try_screenshot(on_screenshot, screenshot_full_page)
            if not run_behaviors or skip_extract_outlinks:
                outlinks = []
            else:
                outlinks = self.extract_outlinks(
                        timeout=extract_outlinks_timeout)
            if run_behaviors and not skip_visit_hashtags:
                self.visit_hashtags(final_page_url, hashtags, outlinks)
            return final_page_url, outlinks
    except brozzler.ReachedLimit:
        # websock_thread has stashed the ReachedLimit exception with
        # more information, raise that one
        raise self.websock_thread.reached_limit
    except websocket.WebSocketConnectionClosedException as e:
        self.logger.error('websocket closed, did chrome die?')
        # chain explicitly so the websocket error is recorded as the cause
        raise BrowsingException(e) from e
    finally:
        # always release the busy flag and drop every per-call callback,
        # including the service worker one, so that a callback from this
        # call cannot fire during a later browse_page() call
        self.is_browsing = False
        self.websock_thread.on_request = None
        self.websock_thread.on_response = None
        self.websock_thread.on_service_worker_version_updated = None
def browse_page(
        self, page_url, extra_headers=None,
        user_agent=None, behavior_parameters=None, behaviors_dir=None,
        on_request=None, on_response=None,
        on_service_worker_version_updated=None, on_screenshot=None,
        username=None, password=None, hashtags=None,
        skip_extract_outlinks=False, skip_visit_hashtags=False,
        skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
    '''
    Browses page in browser.

    Browser should already be running, i.e. start() should have been
    called. Opens the page_url in the browser, runs behaviors, takes a
    screenshot, extracts outlinks.

    Args:
        page_url: url of the page to browse
        extra_headers: dict of extra http headers to configure the browser
            to send with every request (default None)
        user_agent: user agent string, replaces browser default if
            supplied (default None)
        behavior_parameters: dict of parameters for populating the
            javascript behavior template (default None)
        behaviors_dir: Directory containing behaviors.yaml and JS templates
            (default None loads Brozzler default JS behaviors)
        on_request: callback to invoke on every Network.requestWillBeSent
            event, takes one argument, the json-decoded message (default
            None)
        on_response: callback to invoke on every Network.responseReceived
            event, takes one argument, the json-decoded message (default
            None)
        on_service_worker_version_updated: callback to invoke on every
            ServiceWorker.workerVersionUpdated event, takes one argument,
            the json-decoded message (default None)
        on_screenshot: callback to invoke when screenshot is obtained,
            takes one argument, the raw jpeg bytes (default None)
            # XXX takes two arguments, the url of the page at the time the
            # screenshot was taken, and the raw jpeg bytes (default None)
        username: username string to use to try logging in if a login form
            is found in the page (default None)
        password: password string to use to try logging in if a login form
            is found in the page; a login attempt is made only when this
            is truthy (default None)
        hashtags: passed through to visit_hashtags() (default None)
        skip_extract_outlinks: if true, outlinks are not extracted and an
            empty list is used instead (default False)
        skip_visit_hashtags: if true, visit_hashtags() is not called
            (default False)
        skip_youtube_dl: not referenced by this method (default False)
        page_timeout: timeout passed to navigate_to_page() and try_login()
            (default 300)
        behavior_timeout: timeout passed to run_behavior() (default 900)

    Returns:
        A tuple (final_page_url, outlinks).
        final_page_url: the url in the location bar at the end of the
            browse_page cycle, which could be different from the original
            page url if the page redirects, javascript has changed the url
            in the location bar, etc
        outlinks: a list of navigational links extracted from the page

    Raises:
        brozzler.ProxyError: in case of proxy connection error
        brozzler.ReachedLimit: the instance stashed on websock_thread is
            raised in place of the one caught here
        BrowsingException: if browsing the page fails in some other way
    '''
    if not self.is_running():
        raise BrowsingException('browser has not been started')
    if self.is_browsing:
        raise BrowsingException('browser is already busy browsing a page')
    self.is_browsing = True
    if on_request:
        self.websock_thread.on_request = on_request
    if on_response:
        self.websock_thread.on_response = on_response
    if on_service_worker_version_updated:
        self.websock_thread.on_service_worker_version_updated = (
                on_service_worker_version_updated)
    try:
        with brozzler.thread_accept_exceptions():
            self.configure_browser(
                    extra_headers=extra_headers,
                    user_agent=user_agent)
            self.navigate_to_page(page_url, timeout=page_timeout)
            if password:
                self.try_login(username, password, timeout=page_timeout)
                # if login redirected us, return to page_url
                if page_url != self.url().split('#')[0]:
                    self.logger.debug(
                        'login navigated away from %s; returning!',
                        page_url)
                    self.navigate_to_page(page_url, timeout=page_timeout)
            if on_screenshot:
                self._try_screenshot(on_screenshot)
            behavior_script = brozzler.behavior_script(
                    page_url, behavior_parameters,
                    behaviors_dir=behaviors_dir)
            self.run_behavior(behavior_script, timeout=behavior_timeout)
            if skip_extract_outlinks:
                outlinks = []
            else:
                outlinks = self.extract_outlinks()
            if not skip_visit_hashtags:
                self.visit_hashtags(self.url(), hashtags, outlinks)
            final_page_url = self.url()
            return final_page_url, outlinks
    except brozzler.ReachedLimit:
        # websock_thread has stashed the ReachedLimit exception with
        # more information, raise that one
        raise self.websock_thread.reached_limit
    except websocket.WebSocketConnectionClosedException as e:
        self.logger.error('websocket closed, did chrome die?')
        # chain explicitly so the websocket error is recorded as the cause
        raise BrowsingException(e) from e
    finally:
        # always release the busy flag and drop every per-call callback,
        # including the service worker one, so that a callback from this
        # call cannot fire during a later browse_page() call
        self.is_browsing = False
        self.websock_thread.on_request = None
        self.websock_thread.on_response = None
        self.websock_thread.on_service_worker_version_updated = None
def browse_page(
        self, page_url, ignore_cert_errors=False, extra_headers=None,
        user_agent=None, behavior_parameters=None,
        on_request=None, on_response=None, on_screenshot=None):
    '''
    Browses page in browser.

    Browser should already be running, i.e. start() should have been
    called. Opens the page_url in the browser, runs behaviors, takes a
    screenshot, extracts outlinks.

    Args:
        page_url: url of the page to browse
        ignore_cert_errors: accepted but not referenced by this
            implementation (default False)
        extra_headers: dict of extra http headers to configure the browser
            to send with every request (default None); accepted but not
            referenced by this implementation
        user_agent: user agent string, replaces browser default if
            supplied (default None); accepted but not referenced by this
            implementation
        behavior_parameters: dict of parameters for populating the
            javascript behavior template (default None)
        on_request: callback to invoke on every Network.requestWillBeSent
            event, takes one argument, the json-decoded message (default
            None); accepted but not referenced by this implementation
        on_response: callback to invoke on every Network.responseReceived
            event, takes one argument, the json-decoded message (default
            None); accepted but not referenced by this implementation
        on_screenshot: callback to invoke when screenshot is obtained,
            takes one argument, the raw jpeg bytes (default None)
            # XXX takes two arguments, the url of the page at the time the
            # screenshot was taken, and the raw jpeg bytes (default None)

    Returns:
        A tuple (final_page_url, outlinks).
        final_page_url: the url in the location bar at the end of the
            browse_page cycle, which could be different from the original
            page url if the page redirects, javascript has changed the url
            in the location bar, etc
        outlinks: a list of navigational links extracted from the page

    Raises:
        BrowsingException: if browsing the page fails
    '''
    if not self.is_running():
        raise BrowsingException('browser has not been started')
    if self.is_browsing:
        raise BrowsingException('browser is already busy browsing a page')
    self.is_browsing = True
    try:
        self._browser_controller.navigate_to_page(page_url, timeout=300)
        ## if login_credentials:
        ##     self._browser_controller.try_login(login_credentials) (5 min?)
        behavior_script = brozzler.behavior_script(
                page_url, behavior_parameters)
        self._browser_controller.run_behavior(behavior_script, timeout=900)
        if on_screenshot:
            # scroll back to the top before taking the screenshot
            self._browser_controller.scroll_to_top()
            jpeg_bytes = self._browser_controller.screenshot()
            on_screenshot(jpeg_bytes)
        outlinks = self._browser_controller.extract_outlinks()
        ## for each hashtag not already visited:
        ##     navigate_to_hashtag (nothing to wait for so no timeout?)
        ##     if on_screenshot;
        ##         take screenshot (30 sec)
        ##     run behavior (3 min)
        ##     outlinks += retrieve_outlinks (60 sec)
        final_page_url = self._browser_controller.url()
        return final_page_url, outlinks
    except websocket.WebSocketConnectionClosedException as e:
        self.logger.error('websocket closed, did chrome die?')
        # chain explicitly so the websocket error is recorded as the cause
        raise BrowsingException(e) from e
    finally:
        # always release the busy flag, whether browsing succeeded or failed
        self.is_browsing = False