Beispiel #1
0
 async def process(self, response: ClientResponse, request: Request = None):
     """Scan a fetched page for the configured keywords and follow its links.

     Responses that declare a ``Content-Type`` header without ``text`` in it
     are ignored. For every keyword in ``self.input_args['ckeyword']`` that
     occurs in the body, a result dict with ``source``, ``keyword`` and
     ``brief`` (up to 20 characters of context on each side) is yielded.
     Afterwards every ``<a href>`` whose host is in ``self.allow_domain`` is
     yielded as a follow-up ``Request``.

     Args:
         response: The fetched response; its headers, URL and text are read.
         request: The originating request (not used here).
     """
     if 'Content-Type' in response.headers and 'text' not in response.headers[
             'Content-Type']:
         return
     text = await response.text()
     sel = Selector(text)
     source = str(response.url)

     for key in self.input_args['ckeyword']:
         if key not in text:
             continue
         logger.debug(f'Find keyword {key} in html')
         # Escape the keyword: it is literal text, not a pattern. Unescaped
         # metacharacters could raise re.error, match nothing, or introduce
         # capture groups that change what the match returns.
         match = re.search('.{,20}%s.{,20}' % re.escape(key), text)
         # The keyword is known to occur in the text, so a match is expected;
         # fall back to the bare keyword rather than crash if it is not.
         context = match.group(0) if match else key
         yield {
             'source': source,
             'keyword': key,
             'brief': '...' + context + '...',
         }

     for link in sel.xpath('//a/@href').extract():
         # Skip pseudo-links that cannot be fetched.
         if link.startswith(('javascript', 'mailto:')):
             continue
         url = Url(urljoin(source, link))
         if url.netloc in self.allow_domain:
             yield Request(url,
                           headers=self.headers,
                           ext_data=self.input_args)
Beispiel #2
0
    async def startup(self):
        """Yield the initial search requests, one per keyword and result page.

        Each keyword produced by ``ext.args_string_processing`` from
        ``self.input_args['wkeyword']`` is combined with every page number in
        ``range(wsearch_start, wsearch_end, wsearch_step)``. URLs are built by
        ``self.gen_url`` against the Sogou Weixin search endpoint.

        Every keyword gets its own copy of the arguments, so the ``ext_data``
        of an already-yielded request is not rewritten by later keywords and
        ``self.input_args`` itself is left unmodified.
        """
        querys = {
            'type': {
                'type': 'const',
                'value': '2'
            },
            'query': {
                'type': 'keyword',
                'value': ['{wkeyword}']
            },
            's_from': 'input',
        }

        page_range = range(self.input_args['wsearch_start'],
                           self.input_args['wsearch_end'],
                           self.input_args['wsearch_step'])

        for keyword in ext.args_string_processing(self.input_args['wkeyword']):
            # Copy rather than alias: the same dict is attached to each request
            # as ext_data, and aliasing would make every request end up with
            # the last keyword.
            nv = dict(self.input_args)
            nv['wkeyword'] = keyword
            async for url in self.gen_url('https://weixin.sogou.com/weixin',
                                          querys, nv, 'page',
                                          list(page_range)):
                yield Request(url.url_full(encoded=False),
                              headers=self.headers,
                              ext_data=nv)
Beispiel #3
0
 async def process(self, response: ClientResponse, request: Request = None):
     """Extract search hits from a result page and yield them as dicts.

     If the page contains the captcha prompt, the coroutine sleeps for an
     hour, re-queues the same URL and stops, since a captcha page carries no
     results. Otherwise every ``txt-box`` element is parsed into
     ``title`` / ``href`` / ``brief`` and yielded as a result dict.

     When keyword filtering is active, a hit is skipped if any keyword is
     missing from both its brief and its title.

     Args:
         response: The fetched result page.
         request: The originating request; its ``ext_data`` is read.
     """
     text = await response.text()
     if '请输入验证码' in text:
         sleep_time = 3600
         logger.warning(f'出现验证码,休眠 {sleep_time} s...')
         await asyncio.sleep(sleep_time)
         yield Request(response.url.__str__(), headers=self.headers, ext_data=request.ext_data)
         # Nothing to parse on a captcha page; the retry is already queued.
         return

     # NOTE(review): this method reads 'winclude', 'binclude' and 'gkeyword',
     # while the matching startup() visible nearby uses 'wkeyword'. The mixed
     # prefixes look like copy-paste from sibling spiders; confirm which keys
     # are intended.
     key_split: List[str] = []
     if self.input_args['winclude']:
         key_split = request.ext_data['gkeyword'].split()
     rule = {
         'box': {
             'selector': '//*[@class="txt-box"]',
             'child': {
                 'title': '//a//text()',
                 'href': '//a/@data-share',
                 'brief': '//*[@class="txt-info"]//text()', }
         }
     }
     data = await self.parse(text, selector=rule)
     for it in data['box']:
         logger.debug(f'Search keyword {key_split} in {it["brief"]} or {it["title"]}')
         # Skip the hit instead of emitting a bare ``yield`` (which produced
         # None and still fell through to yield the hit). The flag is read
         # inside the generator so it is only looked up when there are keywords.
         keyword_missing = any(
             self.input_args['binclude']
             and kw not in it['brief']
             and kw not in it['title']
             for kw in key_split)
         if keyword_missing:
             continue
         logger.debug(f'Find keyword {key_split} in {it["brief"]} or {it["title"]}')
         yield {
             'source': response.url.__str__(),
             'keyword': key_split,
             'href': it['href'],
             'title': it['title'],
             'brief': it['brief']
         }
Beispiel #4
0
    async def startup(self):
        """Normalise the crawl arguments and yield one request per seed URL.

        ``input_args['curl']`` is replaced by the result of
        ``ext.args_string_processing`` and ``input_args['ckeyword']`` by a set
        built from the same helper. If ``self.allow_domain`` is empty when the
        method starts, the host of every seed URL is added to it.
        """
        seed_urls = ext.args_string_processing(self.input_args['curl'])
        self.input_args['curl'] = seed_urls
        keywords = ext.args_string_processing(self.input_args['ckeyword'])
        self.input_args['ckeyword'] = set(keywords)

        # Decided once, before the loop: adding the first host must not stop
        # the remaining seed hosts from being collected.
        collect_domains = not self.allow_domain
        for seed in self.input_args['curl']:
            if collect_domains:
                self.allow_domain.add(Url(seed).netloc)
            yield Request(seed, headers=self.headers, ext_data=self.input_args)
Beispiel #5
0
 async def startup(self):
     """Template entry point showing two ways to issue the start request."""
     start_url = 'http://example.com'
     # Option 1: construct a Request without yielding it.
     # NOTE(review): presumably the framework picks up a Request on
     # construction; confirm, otherwise this object is simply discarded.
     Request(start_url)
     # Option 2: yield the Request from this async generator.
     yield Request(start_url)