def __on_connection_close(self) -> None:
    """Scan tracked browsers and schedule a replacement for each dead one.

    A browser counts as dead when its connection object has no underlying
    websocket, or that websocket reports it is not open. Replacement is
    scheduled as a task rather than awaited, because this method is
    synchronous.
    """
    logger.info("Checking closed connections.")
    # Iterate over a snapshot so scheduled replacements can not disturb the loop.
    for browser in set(self.browsers.keys()):
        websocket = browser._connection.connection
        if websocket is not None and websocket.open:
            # connection is healthy; nothing to do for this browser.
            continue
        logger.warning(f"Found closed connection: {browser}")
        asyncio.create_task(self.replace_browser(browser))
async def replace_browser(self,
                          browser: Browser,
                          launch_options: Dict[str, Any] = None) -> None:
    """Shut down `browser` and start a fresh one in its place.

    Args:
        browser: The tracked browser to replace.
        launch_options: Optional overrides merged into the stored launch
            options before the new browser is added.

    Returns immediately if `browser` is no longer tracked. If another task
    already holds this browser's lock, waits until that lock is released
    and then returns without starting a second replacement.
    """
    if browser not in self.browsers:
        # someone else already swapped this browser out.
        logger.debug(f'Browser {browser} has already been replaced.')
        return

    record = self.browsers[browser]
    replace_lock = record['lock']

    if replace_lock.locked():
        # a replacement is in flight; poll until it is done, then bail out.
        logger.debug(
            f'Waiting for browser {browser} replacement to finish.')
        while replace_lock.locked():
            await asyncio.sleep(0.5)
        return

    # hold the lock so concurrent callers do not launch duplicate replacements.
    async with replace_lock:
        logger.info(f"Replacing browser: {browser}.")
        if launch_options:
            # merge caller-supplied overrides into the stored options.
            record['launch_options'].update(launch_options)
        # tear down the old browser first.
        await self._shutdown_browser(browser)
        # relaunch with the same page count / server / options.
        relaunch_args = {
            'pages': record['page_count'],
            'server': record['server'],
            'launch_options': record['launch_options'],
        }
        await self.add_browser(**relaunch_args)
    logger.info(f"Browser {browser} replacement complete.")
async def cancel_spider_tasks(self) -> list:
    """Cancel every outstanding task that is running a Spider coroutine.

    Tasks are selected by looking for ``'coro=<Spider.'`` in the task's
    string representation; the task running this method is excluded.

    Returns:
        The list produced by ``asyncio.gather(..., return_exceptions=True)``
        over the cancelled tasks, so each entry is either a task result or
        the exception (including cancellation) that ended the task.
    """
    # Look the current task up once instead of once per candidate task.
    this_task = asyncio.current_task()
    tasks = [
        t for t in asyncio.all_tasks()
        if t is not this_task and 'coro=<Spider.' in str(t)
    ]
    # Plain loop: cancel() is called for its side effect only.
    for t in tasks:
        t.cancel()
    logger.info(f"Cancelling {len(tasks)} outstanding tasks.")
    # Wait for the cancellations to be processed; never raise from here.
    return await asyncio.gather(*tasks, return_exceptions=True)
async def get(self, url: str, retries: int = 2,
              **kwargs) -> Tuple[Response, Page]:
    """Take the next idle page and navigate it to `url`.

    Args:
        url: Address to navigate to.
        retries: How many more attempts may be made after a failure.
        **kwargs: An optional 'cookies' entry is applied to the page before
            navigation; everything else is forwarded to ``page.goto``. A
            'timeout' entry also bounds the whole attempt.

    Returns:
        ``(response, page)`` on success. This method does not put the page
        back on the idle queue in that case. When every retry has been used
        up the retry helper falls through and ``None`` is returned instead
        of a tuple.
    """

    async def _get(url: str, page: Page, **kwargs) -> Response:
        """Run every page call that could hang if the page has crashed."""
        if 'cookies' in kwargs:
            # apply request cookies before navigating.
            await self._set_cookies(page, kwargs.pop('cookies'))
        # whatever is left in kwargs is meant for goto.
        response = await page.goto(url, **kwargs)
        options = self.browsers[page.browser]['launch_options']
        if options.get('screenshot', False):
            # keep a screenshot of the navigated page.
            await self._take_screenshot(page)
        return response

    async def _retry_get(url: str, retries: int, **kwargs):
        """Re-run navigation while attempts remain; otherwise give up."""
        retries -= 1
        if retries < 0:
            logger.error(
                f"Max retries exceeded: {url}. URL can not be navigated.")
            return None
        logger.warning(
            f"Retrying request to {url}. Retries remaining: {retries}")
        retry_task = asyncio.create_task(self.get(url, retries, **kwargs))
        return await retry_task

    # pull the next page off the idle queue.
    page = await self._get_idle_page()
    browser_data = self.browsers[page.browser]
    # NOTE(review): a caller-supplied 'timeout' is used for asyncio.wait_for
    # here and is also forwarded to page.goto via kwargs -- confirm both
    # expect the same unit.
    timeout = kwargs.get('timeout',
                         self._default_nav_func_wait(browser_data))
    try:
        response = await asyncio.wait_for(_get(url, page, **kwargs),
                                          timeout=timeout)
    except asyncio.TimeoutError:
        # exceeding the timeout is treated as a browser crash.
        logger.warning(
            f"Detected browser crash {page.browser} (get timeout exceeded {timeout})"
        )
        await self.replace_browser(page.browser)
        return await _retry_get(url, retries, **kwargs)
    except Exception as e:
        logger.exception(f"Error fetching page {url}: {e}")
        # note the failure against this browser.
        await self._log_browser_error_status(page.browser, True)
        # the page itself is still usable; hand it back to the idle queue.
        await self.set_idle(page)
        return await _retry_get(url, retries, **kwargs)
    else:
        # note that this browser navigated without error.
        await self._log_browser_error_status(page.browser, False)
        if response:
            status = response.status
        else:
            status = None
        logger.info(
            f"[{status}] (server - {browser_data['server']}, browser - {browser_data['id']}, page - {self.pages[page]['id']}): {page.url}"
        )
        return response, page
async def shutdown(self, sig=None) -> None:
    """Cancel Spider tasks, then close every tracked browser concurrently.

    Args:
        sig: Optional signal object that triggered the shutdown; when given,
            its ``name`` is logged.
    """
    if sig is not None:
        logger.info(f"Caught signal: {sig.name}")
    logger.info("Shutting down...")
    await self.cancel_spider_tasks()
    # snapshot the keys: shutting a browser down removes it from self.browsers.
    tracked_browsers = set(self.browsers.keys())
    # one task per browser so that all of them close in parallel.
    closing_tasks = [
        asyncio.create_task(self._shutdown_browser(b))
        for b in tracked_browsers
    ]
    await asyncio.gather(*closing_tasks)
async def _close_page(self, page: Page) -> None:
    """Drop all bookkeeping for `page`, then try to close it.

    The page is removed from the idle queue (if present) and from
    ``self.pages`` before the close is attempted. The close is given two
    seconds; a timeout is logged and otherwise ignored.
    """
    logger.info(f"Removing page: {page}")
    idle_pages = self.idle_page_q._queue
    try:
        # take the page out of the idle queue when it is waiting there.
        idle_pages.remove(page)
    except ValueError:
        # page was not idle; nothing to remove.
        pass
    del self.pages[page]
    try:
        # give the page a short window to close cleanly.
        await asyncio.wait_for(page.close(), timeout=2)
    except asyncio.TimeoutError:
        logger.warning(f"Page {page} could not be properly closed.")
async def _shutdown_browser(self, browser: Browser) -> None:
    """Close `browser`, its pages, and remove it from ``self.browsers``.

    Every page reported by ``browser.pages()`` is closed through
    ``self._close_page``. The browser's close is then attempted with a
    two-second limit; a timeout is logged as a warning and the browser is
    still removed from ``self.browsers``.
    """
    logger.info(f"Removing browser: {browser}")
    # close and forget every page that belongs to this browser.
    for page in await browser.pages():
        await self._close_page(page)
    # Clear the connection's close callback so that closing this browser on
    # purpose does not trigger it (presumably self.__on_connection_close).
    browser._connection._closeCallback = None
    try:
        # attempt a clean close, but do not wait on a hung browser.
        await asyncio.wait_for(browser.close(), timeout=2)
    except asyncio.TimeoutError:
        # best effort: report the failure instead of swallowing it silently.
        logger.warning(f"Browser {browser} could not be properly closed.")
    del self.browsers[browser]
async def _launch_remote_browser(self,
                                 server_ip,
                                 launch_options: Dict[str, Any] = None
                                 ) -> Browser:
    """Start a browser on a remote server and connect to its DevTools endpoint.

    Args:
        server_ip: Address of the remote server; it is used as the host part
            of ``http://{server_ip}/new_browser`` and may carry a ``:port``
            suffix, which is stripped when building the DevTools address.
        launch_options: Options sent to the server as the JSON request body.

    Returns:
        The connected Browser instance, or ``None`` when the server answers
        with a status code other than 200.
    """
    logger.info(
        f"Launching remote browser on {server_ip}:\n{pformat(launch_options)}"
    )
    endpoint = f"http://{server_ip}/new_browser"
    # requests is a blocking client; run it in the default executor so the
    # event loop (and every other page/browser task) keeps running meanwhile.
    loop = asyncio.get_running_loop()
    resp = await loop.run_in_executor(
        None, lambda: requests.get(endpoint, json=launch_options))
    if resp.status_code != 200:
        logger.error(
            f"Could not add browser ({resp.status_code}): {endpoint}")
        return None
    logger.info(f"[{resp.status_code}] Added Browser on {server_ip}")
    # The server reports its DevTools endpoint with a loopback address;
    # swap in the server's host so it is reachable from this machine.
    dev_tools_endpoint = resp.json()['dev_tools'].replace(
        '127.0.0.1',
        server_ip.split(':')[0])
    logger.info(f"Connecting to {server_ip} browser: {dev_tools_endpoint}")
    # connect to the new browser's DevTools endpoint.
    browser = await pyppeteer.launcher.connect(
        browserWSEndpoint=dev_tools_endpoint)
    logger.info(f"Connected to browser {dev_tools_endpoint}: {browser}")
    return browser
async def _launch_local_browser(self,
                                launch_options: Dict[str, Any] = None
                                ) -> Browser:
    """Start a browser on this machine with the given launch options.

    Args:
        launch_options: Options handed unchanged to
            ``pyppeteer.launcher.launch``.

    Returns:
        Whatever ``pyppeteer.launcher.launch`` resolves to.
    """
    logger.info(f"Launching local browser:\n{pformat(launch_options)}")
    local_browser = await pyppeteer.launcher.launch(launch_options)
    return local_browser