Esempio n. 1
0
 def __on_connection_close(self) -> None:
     """Schedule a replacement for every tracked browser whose websocket is gone.

     A browser counts as disconnected when its underlying connection object
     is missing or reports that it is no longer open. Replacement runs in a
     background task, so this method itself never awaits.
     """
     logger.info("Checking closed connections.")
     # iterate over a snapshot; replacements mutate self.browsers.
     for browser in set(self.browsers):
         websocket = browser._connection.connection
         if websocket is not None and websocket.open:
             # connection is still alive, nothing to do for this browser.
             continue
         logger.warning(f"Found closed connection: {browser}")
         asyncio.create_task(self.replace_browser(browser))
Esempio n. 2
0
 async def replace_browser(self,
                           browser: Browser,
                           launch_options: Dict[str, Any] = None) -> None:
     """Close ``browser`` and launch a new one with the same settings.

     If the browser is no longer tracked, it has already been replaced and
     nothing happens. If another task is already replacing it, this call
     waits for that replacement to finish and then returns without launching
     a second browser.

     Args:
         browser: The tracked browser to replace.
         launch_options: Optional overrides merged into the browser's stored
             launch options before the new browser is added.
     """
     # check if this browser has already been replaced.
     if browser not in self.browsers:
         logger.debug(f'Browser {browser} has already been replaced.')
         return
     lock = self.browsers[browser]['lock']
     # check if another task is currently replacing this browser.
     if lock.locked():
         logger.debug(
             f'Waiting for browser {browser} replacement to finish.')
         # queue on the lock instead of polling it: we are woken as soon as
         # the replacing task releases it, with no fixed sleep latency.
         async with lock:
             pass
         # return now that browser replacement is complete.
         return
     # lock this browser so other tasks can not create replacement browsers for this browser.
     async with lock:
         logger.info(f"Replacing browser: {browser}.")
         # keep a reference: _shutdown_browser deletes the self.browsers entry.
         browser_data = self.browsers[browser]
         # update launch options if new options are provided.
         if launch_options:
             browser_data['launch_options'].update(launch_options)
         # close the old browser.
         await self._shutdown_browser(browser)
         # add a new browser.
         await self.add_browser(
             pages=browser_data['page_count'],
             server=browser_data['server'],
             launch_options=browser_data['launch_options'])
     logger.info(f"Browser {browser} replacement complete.")
Esempio n. 3
0
 async def cancel_spider_tasks(self):
     """Cancel all of Spider's tasks and wait for them to finish.

     Tasks are selected by looking for ``'coro=<Spider.'`` in their string
     representation; the task running this method is always excluded.

     Returns:
         The list produced by ``asyncio.gather(..., return_exceptions=True)``
         over the cancelled tasks.
     """
     # look up the current task once rather than once per candidate.
     current = asyncio.current_task()
     tasks = [
         t for t in asyncio.all_tasks()
         if t is not current and 'coro=<Spider.' in str(t)
     ]
     for t in tasks:
         t.cancel()
     logger.info(f"Cancelling {len(tasks)} outstanding tasks.")
     # return_exceptions=True so the cancellations are collected, not raised.
     return await asyncio.gather(*tasks, return_exceptions=True)
Esempio n. 4
0
    async def get(self,
                  url: str,
                  retries: int = 2,
                  **kwargs) -> Tuple[Response, Page]:
        """Load ``url`` in the next idle page, retrying when navigation fails.

        Args:
            url: Address to navigate to.
            retries: Number of further attempts allowed after a failure.
            **kwargs: ``cookies`` (if present) is applied to the page first;
                everything else is forwarded to ``page.goto``. ``timeout``
                is additionally used as the overall time limit.

        Returns:
            ``(response, page)`` on success. When an attempt fails, the
            result of the retry is returned instead, which is ``None`` once
            the retries are exhausted.
        """
        async def _navigate(target: str, tab: Page, **goto_kwargs) -> Response:
            """Run every page call that would hang if the page has crashed."""
            if 'cookies' in goto_kwargs:
                # request cookies were provided; apply them before navigating.
                cookies = goto_kwargs.pop('cookies')
                await self._set_cookies(tab, cookies)
            # whatever is left in goto_kwargs is meant for goto.
            response = await tab.goto(target, **goto_kwargs)
            options = self.browsers[tab.browser]['launch_options']
            if options.get('screenshot', False):
                # screenshots are enabled for this browser.
                await self._take_screenshot(tab)
            return response

        async def _retry_navigation(url: str, retries: int, **kwargs):
            """Start another attempt unless the retry budget is used up."""
            retries -= 1
            if retries < 0:
                logger.error(
                    f"Max retries exceeded: {url}. URL can not be navigated.")
                return None
            logger.warning(
                f"Retrying request to {url}. Retries remaining: {retries}")
            retry_task = asyncio.create_task(self.get(url, retries, **kwargs))
            return await retry_task

        # take the next page off the idle queue.
        page = await self._get_idle_page()
        browser_data = self.browsers[page.browser]
        default_wait = self._default_nav_func_wait(browser_data)
        # NOTE(review): the same 'timeout' kwarg is used for asyncio.wait_for
        # here and forwarded to page.goto; confirm both expect the same unit.
        timeout = kwargs.get('timeout', default_wait)
        try:
            resp = await asyncio.wait_for(_navigate(url, page, **kwargs),
                                          timeout=timeout)
        except asyncio.TimeoutError:
            # exceeding the time limit is treated as a browser crash.
            logger.warning(
                f"Detected browser crash {page.browser} (get timeout exceeded {timeout})"
            )
            await self.replace_browser(page.browser)
            return await _retry_navigation(url, retries, **kwargs)
        except Exception as e:
            logger.exception(f"Error fetching page {url}: {e}")
            # remember that this browser produced a navigation error.
            await self._log_browser_error_status(page.browser, True)
            # the page is still usable, so hand it back to the idle queue.
            await self.set_idle(page)
            return await _retry_navigation(url, retries, **kwargs)
        else:
            # remember that this browser navigated without error.
            await self._log_browser_error_status(page.browser, False)
            if resp:
                status = resp.status
            else:
                status = None
            logger.info(
                f"[{status}] (server - {browser_data['server']}, browser - {browser_data['id']}, page - {self.pages[page]['id']}): {page.url}"
            )
            return resp, page
Esempio n. 5
0
 async def shutdown(self, sig=None) -> None:
     """Cancel outstanding Spider tasks, then close every tracked browser.

     Args:
         sig: Optional signal that triggered the shutdown; when given, its
             ``name`` is logged.
     """
     if sig is not None:
         logger.info(f"Caught signal: {sig.name}")
     logger.info("Shutting down...")
     await self.cancel_spider_tasks()
     # start one shutdown task per browser (snapshot of the keys, because
     # _shutdown_browser removes entries), then wait for all of them.
     closing_tasks = []
     for b in set(self.browsers):
         closing_tasks.append(asyncio.create_task(self._shutdown_browser(b)))
     await asyncio.gather(*closing_tasks)
Esempio n. 6
0
 async def _close_page(self, page: Page) -> None:
     """Close ``page`` and remove all references to it.

     The page is dropped from the idle queue and from ``self.pages`` before
     it is closed. A close that takes longer than 2 seconds is abandoned
     with a warning.

     Args:
         page: The page to forget and close. It does not have to be tracked.
     """
     logger.info(f"Removing page: {page}")
     if page in self.idle_page_q._queue:
         # remove page from idle queue.
         self.idle_page_q._queue.remove(page)
     # pop instead of del: _shutdown_browser passes every page the browser
     # reports, and a page missing from self.pages must not abort cleanup
     # with a KeyError.
     self.pages.pop(page, None)
     try:
         # wait for page to close.
         await asyncio.wait_for(page.close(), timeout=2)
     except asyncio.TimeoutError:
         logger.warning(f"Page {page} could not be properly closed.")
Esempio n. 7
0
 async def _shutdown_browser(self, browser: Browser) -> None:
     """Close ``browser`` and remove all references to it.

     Every page reported by the browser is closed first, then the browser
     itself is given 2 seconds to close. The browser is removed from
     ``self.browsers`` whether or not the close finished in time.

     Args:
         browser: The tracked browser to shut down.
     """
     logger.info(f"Removing browser: {browser}")
     # remove all pages from the browser.
     for page in await browser.pages():
         await self._close_page(page)
     # disable self.__on_connection_close
     browser._connection._closeCallback = None
     # attempt to properly close browser.
     try:
         await asyncio.wait_for(browser.close(), timeout=2)
     except asyncio.TimeoutError:
         # best effort: keep going, but leave a trace like _close_page does.
         logger.warning(f"Browser {browser} could not be properly closed.")
     del self.browsers[browser]
Esempio n. 8
0
 async def _launch_remote_browser(self,
                                  server_ip,
                                  launch_options: Dict[str, Any] = None
                                  ) -> Browser:
     """Start a browser on the machine at ``server_ip`` and connect to it.

     Sends a GET request to ``http://{server_ip}/new_browser`` with
     ``launch_options`` as the JSON body, then connects to the DevTools
     endpoint named in the ``dev_tools`` field of the JSON response.

     Args:
         server_ip: Address of the browser server; a ``:port`` suffix is
             allowed and is stripped when building the DevTools endpoint.
         launch_options: Options sent to the server as the request body.

     Returns:
         The connected browser, or ``None`` when the server does not answer
         with status 200.
     """
     logger.info(
         f"Launching remote browser on {server_ip}:\n{pformat(launch_options)}"
     )
     endpoint = f"http://{server_ip}/new_browser"
     # requests is blocking; run it in the default executor so the event
     # loop keeps serving other tasks while the server starts the browser.
     loop = asyncio.get_running_loop()
     resp = await loop.run_in_executor(
         None, lambda: requests.get(endpoint, json=launch_options))
     if resp.status_code != 200:
         logger.error(
             f"Could not add browser ({resp.status_code}): {endpoint}")
         return None
     logger.info(f"[{resp.status_code}] Added Browser on {server_ip}")
     # construct DevTools endpoint: the server reports a loopback address,
     # so swap in the host part of server_ip.
     dev_tools_endpoint = resp.json()['dev_tools'].replace(
         '127.0.0.1',
         server_ip.split(':')[0])
     logger.info(f"Connecting to {server_ip} browser: {dev_tools_endpoint}")
     # connect to new browser's DevTools endpoint.
     browser = await pyppeteer.launcher.connect(
         browserWSEndpoint=dev_tools_endpoint)
     logger.info(f"Connected to browser {dev_tools_endpoint}: {browser}")
     return browser
Esempio n. 9
0
 async def _launch_local_browser(self,
                                 launch_options: Dict[str, Any] = None
                                 ) -> Browser:
     """Start a browser on this machine and return it.

     Args:
         launch_options: Options handed to ``pyppeteer.launcher.launch``
             unchanged.
     """
     logger.info(f"Launching local browser:\n{pformat(launch_options)}")
     local_browser = await pyppeteer.launcher.launch(launch_options)
     return local_browser