async def _download_request_page(self, request: Request, spider: Spider, page: Page) -> Response:
    """Load ``request.url`` in ``page`` and wrap the page content in a response.

    Request/response handlers are attached to the page, the page coroutines
    found in ``request.meta["pyppeteer_page_coroutines"]`` are awaited in
    order, and the resulting page content becomes the response body.

    The page is passed to the callback (through ``request.cb_kwargs``) when
    one of the callback's annotations is ``pyppeteer.page.Page``; otherwise
    it is closed here.

    Args:
        request: The request being downloaded; its ``meta`` and ``cb_kwargs``
            are updated in place.
        spider: The spider whose ``parse`` is used as the callback when the
            request does not define one.
        page: The page used to perform the navigation.

    Returns:
        A response whose class is chosen by ``responsetypes.from_args`` and
        which carries the ``"pyppeteer"`` flag.
    """
    self.stats.inc_value("pyppeteer/page_count")
    if self.navigation_timeout is not None:
        page.setDefaultNavigationTimeout(self.navigation_timeout)

    await page.setRequestInterception(True)
    page.on("request", partial(_request_handler, scrapy_request=request, stats=self.stats))
    page.on("response", partial(_response_handler, stats=self.stats))

    start_time = time()
    # NOTE(review): pyppeteer's ``goto`` can presumably return None (e.g. for
    # about:blank); ``response.headers`` below would then fail - confirm.
    response = await page.goto(request.url)

    page_coroutines = request.meta.get("pyppeteer_page_coroutines") or ()
    if isinstance(page_coroutines, dict):
        page_coroutines = page_coroutines.values()
    for pc in page_coroutines:
        if not isinstance(pc, PageCoroutine):
            continue
        method = getattr(page, pc.method)

        # Apply the handler-wide timeout only when the coroutine has none of
        # its own. An explicit ``timeout=0`` is a deliberate choice and must
        # not be overwritten, so test for None rather than falsiness.
        if self.page_coroutine_timeout is not None and pc.kwargs.get("timeout") is None:
            pc.kwargs["timeout"] = self.page_coroutine_timeout

        if isinstance(pc, NavigationPageCoroutine):
            # Start waiting for the navigation together with the action that
            # triggers it.
            await asyncio.gather(page.waitForNavigation(), method(*pc.args, **pc.kwargs))
        else:
            pc.result = await method(*pc.args, **pc.kwargs)

    body = (await page.content()).encode("utf8")
    request.meta["download_latency"] = time() - start_time

    callback = request.callback or spider.parse
    annotations = getattr(callback, "__annotations__", {})
    for key, value in annotations.items():
        if value is pyppeteer.page.Page:
            # The callback asked for the page: hand it over still open.
            request.cb_kwargs[key] = page
            self.stats.inc_value("pyppeteer/page_count/injected_callback")
            break
    else:
        # No callback argument wants the page, so release it now.
        await page.close()
        self.stats.inc_value("pyppeteer/page_count/closed")

    headers = Headers(response.headers)
    # Presumably dropped because the body is the page content rather than the
    # encoded bytes received over the wire.
    headers.pop("Content-Encoding", None)
    respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
    return respcls(
        url=page.url,
        status=response.status,
        headers=headers,
        body=body,
        request=request,
        flags=["pyppeteer"],
    )
async def __add_page_settings(self, page: Page) -> None:
    """Add custom settings to page.

    Depending on the instance configuration this sets the default navigation
    timeout, blocks URLs, disables the cache, registers JavaScript to be
    evaluated on every new document, and installs a request handler that
    aborts requests whose resource type is in ``self.request_abort_types``.

    Args:
        page: The page to configure.
    """
    # Change the default maximum navigation timeout.
    if self.default_nav_timeout:
        page.setDefaultNavigationTimeout(self.default_nav_timeout)

    # Coroutines collected here are awaited together at the end.
    tasks = []

    # Blocks URLs from loading.
    if self.blocked_urls:
        self.logger.info(f"Adding {len(self.blocked_urls)} blocked urls")
        tasks.append(
            page._client.send('Network.setBlockedURLs', {
                'urls': self.blocked_urls,
            }))

    # Disable cache for each request.
    if self.disable_cache:
        self.logger.info("Setting cache disabled.")
        tasks.append(page.setCacheEnabled(False))

    # Add JavaScript function(s) that will be invoked whenever the page is navigated.
    if self.js_injection_scripts:
        self.logger.info(
            f"Adding {len(self.js_injection_scripts)} JavaScript injection scripts"
        )
        tasks.extend(
            page.evaluateOnNewDocument(script)
            for script in self.js_injection_scripts)

    module_dir = Path(__file__).parent

    # Add JavaScript functions to prevent automation detection.
    for script_path in module_dir.joinpath('automation_detection').glob("*.js"):
        self.logger.info(
            f"(page {page}) Adding automation detection prevention script: {script_path.name}"
        )
        tasks.append(page.evaluateOnNewDocument(script_path.read_text()))

    # Add JavaScript functions to prevent detection of headless mode.
    if self.headless:
        for script_path in module_dir.joinpath('headless_detection').glob("*.js"):
            self.logger.info(
                f"(page {page}) Adding headless detection prevention script: {script_path.name}"
            )
            tasks.append(page.evaluateOnNewDocument(script_path.read_text()))

    # Intercept all requests and only allow those whose type is not in
    # self.request_abort_types.
    if self.request_abort_types:
        self.logger.info(
            f"Setting request interception for {self.request_abort_types}")
        tasks.append(page.setRequestInterception(True))

        async def block_type(request):
            if request.resourceType in self.request_abort_types:
                await request.abort()
            else:
                await request.continue_()

        # The event loop only keeps weak references to tasks, so hold a strong
        # reference until each handler finishes; otherwise a task could be
        # garbage-collected before the request is aborted or continued.
        pending_handlers = set()

        def schedule_block_type(request):
            task = asyncio.create_task(block_type(request))
            pending_handlers.add(task)
            task.add_done_callback(pending_handlers.discard)
            return task

        page.on('request', schedule_block_type)

    await asyncio.gather(*tasks)
async def _add_page_settings(self, page: Page) -> None:
    """Add custom settings to a page.

    Settings are read from the launch options stored for the page's browser
    in ``self.browsers``. Stealth setup (``self.set_stealth``) is always
    applied; the remaining settings are applied only when the matching
    launch option is present.

    Args:
        page: The page to configure. ``page.browser`` must be a key of
            ``self.browsers``.
    """
    # Launch options for this page.
    launch_options = self.browsers[page.browser]['launch_options']

    # Set the default maximum navigation time.
    if 'defaultNavigationTimeout' in launch_options:
        page.setDefaultNavigationTimeout(
            launch_options['defaultNavigationTimeout'])

    # Coroutines collected here are awaited together at the end.
    tasks = [self.set_stealth(page)]

    # Blocks URLs from loading.
    if 'blockedURLs' in launch_options:
        tasks.append(
            self.set_blocked_urls(page, launch_options['blockedURLs']))

    # Enable or disable the cache for each request.
    if 'setCacheEnabled' in launch_options:
        tasks.append(
            page.setCacheEnabled(launch_options['setCacheEnabled']))

    # Add JavaScript function(s) that will be invoked whenever the page is navigated.
    tasks.extend(
        page.evaluateOnNewDocument(script)
        for script in launch_options.get('evaluateOnNewDocument', []))

    # Intercept all requests and only allow those whose type is not in
    # request_abort_types.
    # NOTE(review): redirect blocking lives inside this branch, so
    # 'blockRedirects' has no effect unless 'requestAbortTypes' is non-empty
    # - confirm that this is intended.
    request_abort_types = launch_options.get('requestAbortTypes')
    if request_abort_types:
        # Enable request interception.
        tasks.append(page.setRequestInterception(True))

        async def block_type(request: Request):
            # Condition(s) where requests should be aborted.
            if request.resourceType in request_abort_types:
                await request.abort()
            elif launch_options.get(
                    'blockRedirects',
                    False) and request.isNavigationRequest() and len(
                        request.redirectChain):
                await request.abort()
            else:
                await request.continue_()

        # The event loop only keeps weak references to tasks, so hold a strong
        # reference until each handler finishes; otherwise a task could be
        # garbage-collected before the request is aborted or continued.
        pending_handlers = set()

        def schedule_block_type(request):
            task = asyncio.create_task(block_type(request))
            pending_handlers.add(task)
            task.add_done_callback(pending_handlers.discard)
            return task

        page.on('request', schedule_block_type)

    await asyncio.gather(*tasks)
async def _add_page_settings(self, page: Page) -> None:
    """Add custom settings to a page.

    The contents of ``stealth.min.js`` (located next to this module) are
    always registered for evaluation on every new document. The remaining
    settings are read from the launch options stored for the page's browser
    in ``self.browsers`` and applied only when the matching option is present.

    Args:
        page: The page to configure. ``page.browser`` must be a key of
            ``self.browsers``.
    """
    # Add JavaScript functions to prevent automation detection.
    stealth_source = Path(__file__).parent.joinpath('stealth.min.js').read_text()
    # Coroutines collected here are awaited together at the end.
    tasks = [page.evaluateOnNewDocument(f"() => {{{stealth_source}}}")]

    # Launch options for this page.
    launch_options = self.browsers[page.browser]['launch_options']

    # Set the default maximum navigation time.
    if 'defaultNavigationTimeout' in launch_options:
        page.setDefaultNavigationTimeout(
            launch_options['defaultNavigationTimeout'])

    # Blocks URLs from loading. Queued with the other setup calls instead of
    # awaited inline: an error raised here would otherwise leave the
    # coroutines already collected in ``tasks`` un-awaited.
    if 'blockedURLs' in launch_options:
        tasks.append(
            page._client.send('Network.setBlockedURLs',
                              {'urls': launch_options['blockedURLs']}))

    # Enable or disable the cache for each request.
    if 'setCacheEnabled' in launch_options:
        tasks.append(
            page.setCacheEnabled(launch_options['setCacheEnabled']))

    # Add JavaScript function(s) that will be invoked whenever the page is navigated.
    tasks.extend(
        page.evaluateOnNewDocument(script)
        for script in launch_options.get('evaluateOnNewDocument', []))

    # Intercept all requests and only allow those whose type is not in
    # request_abort_types.
    request_abort_types = launch_options.get('requestAbortTypes')
    if request_abort_types:
        tasks.append(page.setRequestInterception(True))

        async def block_type(request):
            if request.resourceType in request_abort_types:
                await request.abort()
            else:
                await request.continue_()

        # The event loop only keeps weak references to tasks, so hold a strong
        # reference until each handler finishes; otherwise a task could be
        # garbage-collected before the request is aborted or continued.
        pending_handlers = set()

        def schedule_block_type(request):
            task = asyncio.create_task(block_type(request))
            pending_handlers.add(task)
            task.add_done_callback(pending_handlers.discard)
            return task

        page.on('request', schedule_block_type)

    await asyncio.gather(*tasks)