async def parse_wordstat_page(page: Page) -> Tuple[list, list]:
    """Scrape the "including phrases" and "associations" tables of a page.

    Waits until both table containers and the info wrapper inside the
    phrases container are visible, then extracts the rows of each table in
    the browser.

    Args:
        page: Page that is showing (or about to show) the Wordstat results.

    Returns:
        ``(phrases, assocs)``: for each table, a list of ``[query, count]``
        pairs as produced by the in-page JavaScript, with non-breaking
        spaces removed from ``count``.
    """
    import logging

    log = logging.getLogger(__name__)

    # The third selector is only waited for, its handle is not used. It is
    # the info wrapper of the phrases block; the original note describes its
    # text as "what was searched with the word <query> - N impressions per
    # month".
    phrases_div, assoc_div, _info_wrapper = await asyncio.gather(
        page.waitForSelector(
            'div.b-word-statistics__including-phrases', {'visible': True}),
        page.waitForSelector(
            'div.b-word-statistics__phrases-associations', {'visible': True}),
        page.waitForSelector(
            'div.b-word-statistics__including-phrases .b-word-statistics__info-wrapper',
            {'visible': True}),
    )

    # Runs inside the page and uses the page's own `$`. Kept as a non-raw
    # string on purpose: Python turns \xa0 into a literal non-breaking space
    # inside the JavaScript regex.
    PARSE_WORDSTAT_TABLE_F = ''' rows => rows.map(row => { const j_row = $(row); const query = j_row.find('a.b-phrase-link__link').text(); const count = j_row.find('td.b-word-statistics__td-number').text().replace(/\xa0/gi, ''); return [query, count]; }) '''

    # 'tr + tr' matches every row that directly follows another row, so the
    # first row of each table is skipped.
    phrases, assocs = await asyncio.gather(
        phrases_div.JJeval('tr + tr', PARSE_WORDSTAT_TABLE_F),
        assoc_div.JJeval('tr + tr', PARSE_WORDSTAT_TABLE_F))

    # Diagnostics go through logging instead of stdout so callers decide
    # whether they are shown.
    log.debug('phrases_div - %s', phrases_div)
    log.debug('phrases - %s', phrases)
    log.debug('assocs - %s', assocs)

    return phrases, assocs
async def _download_request_with_page(self, request: Request, spider: Spider,
                                      page: Page) -> Response:
    """Load ``request.url`` in ``page`` and wrap the rendered result in a response.

    After navigation, every ``PageCoroutine`` found under
    ``request.meta["pyppeteer_page_coroutines"]`` is executed against the
    page in order. The page is then either handed to the callback (when one
    of the callback's annotations is ``pyppeteer.page.Page``) or closed.

    Args:
        request: Request being downloaded; its ``meta`` and ``cb_kwargs``
            are updated in place.
        spider: Spider whose ``parse`` is used when the request has no callback.
        page: Page used for the navigation.

    Returns:
        A response built from the page content, the page URL and the status
        and headers of the navigation response, flagged ``"pyppeteer"``.
    """
    started_at = time()
    response = await page.goto(request.url)

    coroutines = request.meta.get("pyppeteer_page_coroutines") or ()
    if isinstance(coroutines, dict):
        coroutines = coroutines.values()

    for coroutine in coroutines:
        # Entries that are not PageCoroutine instances are ignored.
        if not isinstance(coroutine, PageCoroutine):
            continue
        method = getattr(page, coroutine.method)
        # Apply the handler-wide timeout only when the coroutine has no
        # (truthy) timeout of its own.
        if self.page_coroutine_timeout is not None:
            if not coroutine.kwargs.get("timeout", None):
                coroutine.kwargs["timeout"] = self.page_coroutine_timeout
        if isinstance(coroutine, NavigationPageCoroutine):
            # Start waiting for the navigation together with the action
            # that triggers it.
            await asyncio.gather(
                page.waitForNavigation(),
                method(*coroutine.args, **coroutine.kwargs),
            )
        else:
            coroutine.result = await method(*coroutine.args,
                                            **coroutine.kwargs)

    body = (await page.content()).encode("utf8")
    request.meta["download_latency"] = time() - started_at

    # Give the page to the callback if it asks for one via a type hint;
    # otherwise the page is no longer needed and gets closed.
    callback = request.callback or spider.parse
    hints = getattr(callback, "__annotations__", {})
    page_handed_over = False
    for arg_name, hint in hints.items():
        if hint is pyppeteer.page.Page:
            request.cb_kwargs[arg_name] = page
            self.stats.inc_value("pyppeteer/page_count/injected_callback")
            page_handed_over = True
            break
    if not page_handed_over and not page.isClosed():
        await page.close()
        self.stats.inc_value("pyppeteer/page_count/closed")

    headers = Headers(response.headers)
    # The body was obtained from the page content, so the original
    # Content-Encoding header is dropped.
    headers.pop("Content-Encoding", None)
    response_cls = responsetypes.from_args(headers=headers, url=page.url,
                                           body=body)
    return response_cls(
        url=page.url,
        status=response.status,
        headers=headers,
        body=body,
        request=request,
        flags=["pyppeteer"],
    )
async def __add_page_settings(self, page: Page) -> None:
    """Add custom settings to page.

    Applies, depending on the instance configuration: the default
    navigation timeout, blocked URLs, cache disabling, user injection
    scripts, the bundled detection-prevention scripts and request
    interception that aborts requests whose resource type is in
    ``self.request_abort_types``. All awaitable settings are collected and
    awaited together at the end.

    Args:
        page: Page to configure.
    """
    # Change the default maximum navigation timeout.
    if self.default_nav_timeout:
        page.setDefaultNavigationTimeout(self.default_nav_timeout)

    tasks = []

    # Blocks URLs from loading.
    if self.blocked_urls:
        self.logger.info(f"Adding {len(self.blocked_urls)} blocked urls")
        tasks.append(
            page._client.send('Network.setBlockedURLs', {
                'urls': self.blocked_urls,
            }))

    # Disable cache for each request.
    if self.disable_cache:
        self.logger.info("Setting cache disabled.")
        tasks.append(page.setCacheEnabled(False))

    # Add JavaScript function(s) that will be invoked whenever the page is
    # navigated.
    if self.js_injection_scripts:
        self.logger.info(
            f"Adding {len(self.js_injection_scripts)} JavaScript injection scripts"
        )
        for script in self.js_injection_scripts:
            tasks.append(page.evaluateOnNewDocument(script))

    # Add JavaScript functions to prevent automation detection.
    for f in Path(__file__).parent.joinpath('automation_detection').glob(
            "*.js"):
        self.logger.info(
            f"(page {page}) Adding automation detection prevention script: {f.name}"
        )
        tasks.append(page.evaluateOnNewDocument(f.read_text()))

    # Add JavaScript functions to prevent detection of headless mode.
    if self.headless:
        for f in Path(__file__).parent.joinpath('headless_detection').glob(
                "*.js"):
            self.logger.info(
                f"(page {page}) Adding headless detection prevention script: {f.name}"
            )
            tasks.append(page.evaluateOnNewDocument(f.read_text()))

    # Intercept all requests and only allow requests for types not in
    # self.request_abort_types.
    if self.request_abort_types:
        self.logger.info(
            f"Setting request interception for {self.request_abort_types}")
        tasks.append(page.setRequestInterception(True))

        async def block_type(request):
            if request.resourceType in self.request_abort_types:
                await request.abort()
            else:
                await request.continue_()

        # The event loop only keeps weak references to tasks, so a task
        # whose result is discarded may be garbage-collected before it
        # finishes. Hold each one here until it is done.
        in_flight = set()

        def _spawn_block_type(request):
            task = asyncio.create_task(block_type(request))
            in_flight.add(task)
            task.add_done_callback(in_flight.discard)
            return task

        page.on('request', _spawn_block_type)

    await asyncio.gather(*tasks)
async def _set_cookies(self, page: Page,
                       cookies: Union[List[Dict[str, str]],
                                      Dict[str, str]]) -> None:
    """Add cookies to page.

    Args:
        page: Page that receives the cookies.
        cookies: Either a single cookie dict, or a list/tuple/set of cookie
            dicts. Any other type is ignored.
    """
    # A single cookie: one call, nothing to fan out.
    if isinstance(cookies, dict):
        await page.setCookie(cookies)
        return

    # Several cookies: one setCookie call per cookie, awaited together.
    if isinstance(cookies, (list, tuple, set)):
        pending = (page.setCookie(cookie) for cookie in cookies)
        await asyncio.gather(*pending)
async def _close_page(self, page: Page) -> None:
    """Forget ``page`` and close it, giving the close at most two seconds.

    The page is removed from the idle queue (if it is waiting there) and
    from ``self.pages`` before the close is attempted.

    Args:
        page: Page to remove and close.

    Raises:
        KeyError: Propagated from ``del self.pages[page]`` when the page is
            not a key of ``self.pages`` (assuming it is a mapping).
    """
    logger.info(f"Removing page: {page}")

    # Reaches into the queue's underlying container so the page can be
    # dropped regardless of its position.
    idle_pages = self.idle_page_q._queue
    if page in idle_pages:
        idle_pages.remove(page)

    del self.pages[page]

    try:
        # Do not wait forever for the page to close.
        await asyncio.wait_for(page.close(), timeout=2)
    except asyncio.TimeoutError:
        logger.warning(f"Page {page} could not be properly closed.")
async def __try_load_contact_by_number(page: Page, target: str) -> bool:
    """Try to open the chat for ``target`` treated as a phone number.

    Args:
        page: Page used to load the contact URL.
        target: Candidate phone number; it must convert with ``int()`` to a
            non-zero value for the load to be attempted.

    Returns:
        ``True`` when the contact URL was loaded and the "invalid number"
        OK button was not found on the page. ``False`` when ``target`` is
        not a non-zero integer, when the "invalid number" button was found
        (it is clicked first), or when any exception occurred.
    """
    try:
        if not int(target):
            return False

        __logger.debug("Loading contact by number.")
        # Auto-handle any dialog raised while the page loads.
        page.on(
            'dialog',
            lambda dialog: asyncio.ensure_future(__accept_dialog(dialog))
        )
        await load_website(page, f"{websites['wpp_unknown']}{target}")

        # Pause before probing for the popup. This must be a non-blocking
        # sleep: time.sleep() here would freeze the whole event loop
        # (including the dialog handler registered above) for two seconds.
        await asyncio.sleep(2)

        ok_button = whatsapp_selectors_dict["invalid_number_ok_button"]
        invalid_number = await page.evaluate(
            f'document.querySelector("{ok_button}") != null')
        if invalid_number:
            await page.click(ok_button)
            __logger.debug(f"Invalid number: {target}")
            print(f"Invalid Number: {target}")
            return False
        return True
    except Exception as e:
        # Best effort: any failure (including a non-numeric target) is
        # reported as "could not load by number".
        __logger.error(f"Error loading contact by number: {str(e)}")
        return False
async def load_cookies(self, page: Page):
    """Load cookies from ``self.cookie_path`` into ``page``.

    Cookies whose ``"name"`` is listed in ``self.IGNORE_COOKIE_NAMES`` are
    skipped. When the cookie file does not exist nothing is set and only an
    info message is logged.

    Args:
        page: Page that receives the stored cookies.
    """
    if not os.path.exists(self.cookie_path):
        LOG.info(
            f"Cookies for {self.password_manager.username} not yet existing."
        )
        return

    with open(self.cookie_path, mode="r") as f:
        cookies = json.load(f)

    tasks = [
        asyncio.create_task(page.setCookie(c)) for c in cookies
        if c["name"] not in self.IGNORE_COOKIE_NAMES
    ]
    # asyncio.wait() rejects an empty collection, hence the guard.
    if tasks:
        done, _ = await asyncio.wait(tasks,
                                     return_when=asyncio.ALL_COMPLETED)
        # asyncio.wait() never raises on behalf of its tasks, so failures
        # have to be collected explicitly or they are lost.
        for task in done:
            if task.cancelled():
                continue
            error = task.exception()
            if error is not None:
                LOG.warning(
                    f"Setting a cookie for {self.password_manager.username} failed: {error!r}"
                )
    LOG.info(f"Cookies for {self.password_manager.username} loaded.")
def toService(page: Page):
    """Drive the second frame of ``page`` through a fixed click/type sequence.

    Calls ``toCe(page)`` first, then clicks a link, waits 3000 ms, clicks a
    tab and a dropdown, types ``"zys-pay"`` into the search input and
    clicks the matching result entry.

    Each step is passed through ``then`` (presumably runs the coroutine to
    completion - confirm against its definition).

    Args:
        page: Page to operate on; ``page.frames[1]`` is looked up again for
            every step.
    """
    entry_link = "#pageContent > div > div.row > div:nth-child(2) > a"
    second_tab = "#pageContent > div > div.row > div > div > div.tabbable.tabbable-tabdrop > ul > li:nth-child(2) > a"
    dropdown = "#allApplyInfo > div > div:nth-child(1) > label:nth-child(1) > div > span > span.selection > span"
    search_input = "body > span > span > span.select2-search.select2-search--dropdown > input"
    # NOTE(review): `:contains()` is a jQuery extension, not a standard CSS
    # selector - confirm that clicking with this selector actually works.
    result_item = "#select2-select2-button-addons-single-input-group-sm-results > li:contains('zys-pay')"

    toCe(page)

    then(page.frames[1].click(entry_link))
    then(page.waitFor(3000))
    then(page.frames[1].click(second_tab))
    then(page.frames[1].click(dropdown))
    then(page.frames[1].type(search_input, "zys-pay"))
    then(page.frames[1].click(result_item))
async def _add_page_settings(self, page: Page) -> None:
    """Add custom settings to a page.

    Injects the bundled ``stealth.min.js`` and then applies whatever the
    launch options of the page's browser request: default navigation
    timeout, blocked URLs, cache setting, extra injection scripts and
    request interception that aborts the listed resource types.

    Args:
        page: Page to configure; ``page.browser`` must be a key of
            ``self.browsers``.
    """
    # add JavaScript functions to prevent automation detection.
    stealth_js = Path(__file__).parent.joinpath('stealth.min.js').read_text()
    tasks = [page.evaluateOnNewDocument(f"() => {{{stealth_js}}}")]

    # launch options for this page.
    launch_options = self.browsers[page.browser]['launch_options']

    # set the default maximum navigation time.
    if 'defaultNavigationTimeout' in launch_options:
        page.setDefaultNavigationTimeout(
            launch_options['defaultNavigationTimeout'])

    # blocks URLs from loading.
    if 'blockedURLs' in launch_options:
        await page._client.send('Network.setBlockedURLs',
                                {'urls': launch_options['blockedURLs']})

    # disable cache for each request.
    if 'setCacheEnabled' in launch_options:
        tasks.append(
            page.setCacheEnabled(launch_options['setCacheEnabled']))

    # add JavaScript function(s) that will be invoked whenever the page is
    # navigated.
    for script in launch_options.get('evaluateOnNewDocument', []):
        tasks.append(page.evaluateOnNewDocument(script))

    # intercept all requests and only allow requests for types not in
    # request_abort_types.
    request_abort_types = launch_options.get('requestAbortTypes')
    if request_abort_types:
        tasks.append(page.setRequestInterception(True))

        async def block_type(request):
            if request.resourceType in request_abort_types:
                await request.abort()
            else:
                await request.continue_()

        # The event loop only keeps weak references to tasks, so a task
        # whose result is discarded may be garbage-collected before it
        # finishes. Hold each one here until it is done.
        in_flight = set()

        def _spawn_block_type(request):
            task = asyncio.create_task(block_type(request))
            in_flight.add(task)
            task.add_done_callback(in_flight.discard)
            return task

        page.on('request', _spawn_block_type)

    await asyncio.gather(*tasks)
async def _add_page_settings(self, page: Page) -> None:
    """Add custom settings to a page.

    Always schedules ``self.set_stealth(page)`` and then applies whatever
    the launch options of the page's browser request: default navigation
    timeout, blocked URLs, cache setting, extra injection scripts and
    request interception. With interception on, a request is aborted when
    its resource type is listed in ``requestAbortTypes``, or when
    ``blockRedirects`` is set and the request is a navigation request with
    a non-empty redirect chain.

    Args:
        page: Page to configure; ``page.browser`` must be a key of
            ``self.browsers``.
    """
    # launch options for this page.
    launch_options = self.browsers[page.browser]['launch_options']

    # set the default maximum navigation time.
    if 'defaultNavigationTimeout' in launch_options:
        page.setDefaultNavigationTimeout(
            launch_options['defaultNavigationTimeout'])

    tasks = [self.set_stealth(page)]

    # blocks URLs from loading.
    if 'blockedURLs' in launch_options:
        tasks.append(
            self.set_blocked_urls(page, launch_options['blockedURLs']))

    # disable cache for each request.
    if 'setCacheEnabled' in launch_options:
        tasks.append(
            page.setCacheEnabled(launch_options['setCacheEnabled']))

    # add JavaScript function(s) that will be invoked whenever the page is
    # navigated.
    for script in launch_options.get('evaluateOnNewDocument', []):
        tasks.append(page.evaluateOnNewDocument(script))

    # intercept all requests and only allow requests for types not in
    # request_abort_types.
    request_abort_types = launch_options.get('requestAbortTypes')
    if request_abort_types:
        # enable request interception.
        tasks.append(page.setRequestInterception(True))

        async def block_type(request: Request):
            # condition(s) where requests should be aborted.
            if request.resourceType in request_abort_types:
                await request.abort()
            elif launch_options.get(
                    'blockRedirects',
                    False) and request.isNavigationRequest() and len(
                        request.redirectChain):
                await request.abort()
            else:
                await request.continue_()

        # The event loop only keeps weak references to tasks, so a task
        # whose result is discarded may be garbage-collected before it
        # finishes. Hold each one here until it is done.
        in_flight = set()

        def _spawn_block_type(request):
            task = asyncio.create_task(block_type(request))
            in_flight.add(task)
            task.add_done_callback(in_flight.discard)
            return task

        page.on('request', _spawn_block_type)

    await asyncio.gather(*tasks)
def _add_page_listeners(self, page: Page) -> None:
    """Enable request interception on ``page`` and attach the traffic hooks.

    Args:
        page: Page whose ``request`` and ``response`` events are routed to
            ``self._on_request`` and ``self._on_response``.
    """
    # Interception has to be switched on for the same page the handlers are
    # attached to, so use the argument rather than ``self._page``; the two
    # are only interchangeable when the caller passes ``self._page``.
    syncer.sync(page.setRequestInterception(True))
    page.on('request', self._on_request)
    page.on('response', self._on_response)
async def close_page(self, page: Page) -> None:
    """Attempt to close a page, waiting at most two seconds.

    Best effort: any exception raised while closing (including the
    timeout) is swallowed and reported as a warning.

    Args:
        page: Page to close.
    """
    try:
        closing = asyncio.wait_for(page.close(), timeout=2)
        await closing
    except Exception:
        self.logger.warning(f"Page {page} could not be properly closed.")
def toCe(page: Page):
    """Run ``toIdc(page)`` and then navigate ``page`` to the cloud engine URL.

    Args:
        page: Page to navigate.
    """
    cloud_engine_url = "http://cloudengine.yunzong.me:10470"

    toIdc(page)
    then(page.goto(cloud_engine_url))
def toIdc(page: Page):
    """Open the ID center and sign in when the login form is shown.

    The login form is detected by the presence of ``#in_user_Nm``; when no
    such element is found nothing is typed or clicked.

    Args:
        page: Page to navigate.
    """
    user_field = "#in_user_Nm"

    then(page.goto("http://idcenter.box.zonghengke.com"))

    # No user-name field on the page: nothing to fill in.
    if not then(page.JJ(user_field)):
        return

    # NOTE(review): credentials are hard-coded in source - consider moving
    # them to configuration or the environment.
    then(page.type(user_field, "gaowenbo"))
    then(page.type("#in_password", "YKUacrVjlfoR"))
    then(page.click("#sign_in"))
def toIdc(driver: page.Page):
    """Start navigating ``driver`` to https://www.baidu.com.

    Args:
        driver: Page to navigate.

    Returns:
        Whatever ``driver.goto`` returns. If ``goto`` is a coroutine
        function (as it is on a pyppeteer page - confirm which ``page``
        module is imported here), the navigation only takes place once the
        returned awaitable is awaited, so callers should await it or run it
        on an event loop.
    """
    # Hand the result back instead of dropping it: a discarded coroutine is
    # never executed.
    return driver.goto("https://www.baidu.com")