async def process(self, response: ClientResponse, request: Request = None):
    """Scan a fetched page for the configured keywords and follow its links.

    Pages whose ``Content-Type`` header is present and does not contain
    ``'text'`` are skipped entirely.

    Args:
        response: The fetched response; its body is read with ``text()``.
        request: The request that produced ``response`` (not used here).

    Yields:
        For each keyword in ``self.input_args['ckeyword']`` found in the
        body, a dict with ``source`` (page URL), ``keyword`` and ``brief``
        (up to 20 characters of context on each side of the first hit).
        Then, for each ``<a href>`` whose host is in ``self.allow_domain``,
        a follow-up ``Request``.
    """
    if 'Content-Type' in response.headers and 'text' not in response.headers['Content-Type']:
        return

    page_url = str(response.url)
    text = await response.text()
    sel = Selector(text)

    for key in self.input_args['ckeyword']:
        if key not in text:
            continue
        logger.debug(f'Find keyword {key} in html')
        # The keyword is literal text, not a pattern: escape it so regex
        # metacharacters (e.g. "c++", "a(b") cannot raise re.error or make
        # the search miss a keyword that `key in text` already found.
        hit = re.search('.{,20}%s.{,20}' % re.escape(key), text)
        # With the keyword escaped a match is expected; fall back to the
        # keyword itself rather than crash if there is none.
        context = hit.group(0) if hit else key
        yield {
            'source': page_url,
            'keyword': key,
            'brief': '...' + context + '...',
        }

    for link in sel.xpath('//a/@href').extract():
        # Skip pseudo-links that cannot be fetched.
        if link.startswith(('javascript', 'mailto:')):
            continue
        url = Url(urljoin(page_url, link))
        if url.netloc in self.allow_domain:
            yield Request(url, headers=self.headers, ext_data=self.input_args)
async def startup(self):
    """Yield the initial Sogou WeChat search requests, one per keyword/page.

    Each keyword produced by ``ext.args_string_processing`` from
    ``self.input_args['wkeyword']`` is expanded through ``self.gen_url``
    over the page numbers ``range(wsearch_start, wsearch_end, wsearch_step)``.

    Yields:
        A ``Request`` per generated URL. Its ``ext_data`` is a per-keyword
        copy of ``self.input_args`` with ``'wkeyword'`` set to that keyword.
    """
    query_spec = {
        'type': {
            'type': 'const',
            'value': '2',
        },
        'query': {
            'type': 'keyword',
            'value': ['{wkeyword}'],
        },
        's_from': 'input',
    }
    for keyword in ext.args_string_processing(self.input_args['wkeyword']):
        # Copy instead of aliasing self.input_args: otherwise every request
        # shares one dict and later keywords overwrite 'wkeyword' in the
        # ext_data of requests that were already yielded.
        nv = dict(self.input_args)
        nv['wkeyword'] = keyword
        pages = list(range(self.input_args['wsearch_start'],
                           self.input_args['wsearch_end'],
                           self.input_args['wsearch_step']))
        async for url in self.gen_url('https://weixin.sogou.com/weixin',
                                      query_spec, nv, 'page', pages):
            yield Request(url.url_full(encoded=False),
                          headers=self.headers,
                          ext_data=nv)
async def process(self, response: ClientResponse, request: Request = None):
    """Extract search results from a Sogou WeChat result page.

    If the page contains the captcha prompt, the coroutine sleeps for an
    hour, re-queues the same URL and stops; the captcha page itself is not
    parsed.

    Args:
        response: The fetched result page.
        request: The originating request. Its ``ext_data`` is read, so it
            must not be ``None`` in practice despite the default.

    Yields:
        Either a retry ``Request`` (captcha case), or one dict per result
        box with ``source``, ``keyword``, ``href``, ``title`` and ``brief``.
        When keyword filtering is active, results in which some keyword
        appears in neither the brief nor the title are skipped.
    """
    text = await response.text()
    page_url = str(response.url)

    if '请输入验证码' in text:
        sleep_time = 3600
        logger.warning(f'出现验证码,休眠 {sleep_time} s...')
        await asyncio.sleep(sleep_time)
        yield Request(page_url, headers=self.headers, ext_data=request.ext_data)
        # Stop here: the body is a captcha page, not a result list.
        return

    # NOTE(review): this reads 'gkeyword' and (below) 'binclude' while the
    # neighbouring option is named 'winclude'; confirm these are the keys the
    # caller actually populates.
    key_split: List[str] = []
    if self.input_args['winclude']:
        key_split = request.ext_data['gkeyword'].split()

    rule = {
        'box': {
            'selector': '//*[@class="txt-box"]',
            'child': {
                'title': '//a//text()',
                'href': '//a/@data-share',
                'brief': '//*[@class="txt-info"]//text()',
            },
        },
    }
    data = await self.parse(text, selector=rule)

    for it in data['box']:
        logger.debug(f'Search keyword {key_split} in {it["brief"]} or {it["title"]}')
        # Skip the result when filtering is on and some keyword is in
        # neither the brief nor the title. The previous bare `yield` emitted
        # None and then still yielded the result, so nothing was filtered.
        if key_split and self.input_args['binclude'] and any(
                word not in it['brief'] and word not in it['title']
                for word in key_split):
            continue
        logger.debug(f'Find keyword {key_split} in {it["brief"]} or {it["title"]}')
        yield {
            'source': page_url,
            'keyword': key_split,
            'href': it['href'],
            'title': it['title'],
            'brief': it['brief'],
        }
async def startup(self):
    """Normalise the crawl arguments and yield one request per seed URL.

    ``input_args['curl']`` is replaced by the result of
    ``ext.args_string_processing`` and ``input_args['ckeyword']`` by a set
    built the same way. If ``self.allow_domain`` is empty on entry, the
    netloc of every seed URL is added to it.

    Yields:
        A ``Request`` for each seed URL, carrying ``self.input_args`` as
        ``ext_data``.
    """
    args = self.input_args
    args['curl'] = ext.args_string_processing(args['curl'])
    args['ckeyword'] = set(ext.args_string_processing(args['ckeyword']))

    # Decided once, up front: adding the first domain below must not stop
    # the remaining seeds from being registered too.
    collect_domains = not self.allow_domain

    for seed in args['curl']:
        if collect_domains:
            self.allow_domain.add(Url(seed).netloc)
        yield Request(seed, headers=self.headers, ext_data=self.input_args)
async def startup(self):
    """Yield the initial request(s) for the crawl (template placeholder)."""
    # Template hook: replace the placeholder URL with the real start URL(s).
    # The original note offered two ways to seed the crawl: "set start" with
    # Request('http://example.com'), or yield the Request directly as below.
    # NOTE(review): how the non-yield variant is wired is not visible here;
    # confirm before relying on it.
    yield Request('http://example.com')