Example 1
 def item_parse(self, _configs: list, response, response1=None):
     '''
     @params _configs -> field-extraction configs (list)
     @params response -> Response
     @output -> yields dicts of field -> value
     '''
     if hasattr(response, 'url'):
         response1 = response
     for configs in _configs:
         response_change = self.change_response_f_type(configs, response)
         if configs['list']['v']:
             _response_copy = S.select_content(response_change,
                                               configs['list'], response1) or []
         else:
             if isinstance(response_change, list):
                 _response_copy = response_change
             else:
                 _response_copy = [response_change]
         for _response in _response_copy:
             if not _response:
                 continue  # skip an empty row rather than aborting the whole parse
             result = dict()
             for config in configs['data']:
                 result[config['En']] = S.select_content(
                     _response, config, response1)
                 result[config['En']] = S.replace_all(result[config['En']])
             item = self.item_db_parse(configs, result)
             if item:
                 # keep a persistent count of yielded items
                 self.state['items_count'] = self.state.get(
                     'items_count', 0) + 1
                 yield item
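
A minimal sketch of one _configs entry, inferred purely from the keys item_parse reads ('list', 'v', 'data', 'En'); the selector strings and the 'v' key inside data entries are assumptions, not taken from the project:

# Hypothetical config; only the key names mirror item_parse above.
example_configs = [{
    'list': {'v': '//table//tr'},                # row selector; a falsy 'v' uses the response as-is
    'data': [
        {'En': 'title', 'v': './td[1]/text()'},  # field name -> selector (assumed shape)
        {'En': 'date', 'v': './td[2]/text()'},
    ],
}]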
Example 2
 def parse(self, response):
     # retry if the response looks like a timeout / anti-crawl error
     request = checkTimeError(response)
     if request:
         yield request
         return
     # detail-page links
     followConfigs = choice(response.url, urlconfigs)
     try:
         urls = S.select_content(response, followConfigs, response)
         for url in urls:
             url = response.urljoin(url)
             headers['Referer'] = url  # note: mutates the shared headers dict
             yield scrapy.Request(url,
                                  headers=headers,
                                  callback=self.infoParse)
     except Exception as e:
         print(response.text, e)
     # next-page links
     nextPageConfigs = choice(response.url, pageConfigs)
     try:
         urls = S.select_content(response, nextPageConfigs, response)
         for url in urls:
             print(url)
             url = response.urljoin(url)
             headers['Referer'] = url
             yield scrapy.Request(url,
                                  headers=headers)
     except Exception as e:
         print(response.text, e)
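
The choice helper used above is not shown in these examples; a plausible sketch, assuming urlconfigs maps a URL substring to its config (the mapping shape is an assumption):

def choice(url, configs):
    # Hypothetical: return the first config whose pattern occurs in the URL.
    for pattern, config in configs.items():
        if pattern in url:
            return config
    return None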
Example 3
def getUrl2(response, config, formats=None, formats2=None):
    urlList = []  # initialize up front so the return below is never unbound
    urls = S.select_content(response, config, response)
    if urls:
        for url in urls:
            if formats:
                # url is a tuple here, unpacked into the format template
                urlList.append(formats.format(*url))
            else:
                urlList.append(formats2 % url)
    return urlList
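
A usage sketch: formats receives each extracted value as a tuple and unpacks it via str.format, while formats2 substitutes a single value with %-formatting. The config objects and URL templates are illustrative:

detail_urls = getUrl2(response, detail_config,
                      formats='https://example.com/{0}/detail/{1}')
page_urls = getUrl2(response, page_config,
                    formats2='https://example.com/page/%s')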
Example 4
 def configParse(self, configs, _response, response=None):
     item = SzseItem()
     if isinstance(configs, dict):
         configs = [configs]
     for _configs in configs:
         # iterate over what may be multiple configs
         if _configs.get('flag') is None:
             _configs['flag'] = True
         if _configs['list']['v'] and _configs['flag']:
             res = S.select_content(_response, _configs['list'])
         elif isinstance(_response, list):
             res = _response
         else:
             # wrap the response in a list so it is iterable
             res = [_response]
         if res:
             for _res in res:
                 # initialize the result dict
                 result = dict()
                 # extract each configured field
                 for config in _configs['data']:
                     k = config['En']
                     result[k] = S.select_content(_res, config, response)
                     result[k] = S.replace_invalid_char(result[k])
                     result[k] = S.replace_invalid_html_char(result[k])
                 if _configs.get('clear'):
                     for config in _configs['clear']:
                         k = config['En']
                         result[k] = S.select_content(
                             result[k], config, response)
                 item['result'] = result
                 item['keys'] = _configs['list']['keys']
                 item['db'] = _configs['list']['db']
                 item['conn'] = _configs['list'].get('conn')
                 # hand the item to the pipeline for field processing
                 if result[_configs['list']['check']]:
                     yield item
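
A sketch of a config exercising every key configParse reads; the key names mirror the code above, while selectors and values are made up:

config = {
    'flag': True,                      # gates list extraction
    'list': {
        'v': '//div[@class="row"]',    # row selector; falsy disables splitting
        'keys': ['url'],               # dedup keys handed to the pipeline
        'db': 'szse_notices',          # target table/collection (illustrative)
        'conn': None,                  # optional connection identifier
        'check': 'title',              # item is yielded only if this field is truthy
    },
    'data': [{'En': 'title', 'v': './/a/text()'}],
    'clear': [],                       # optional post-processing configs
}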
Example 5
 def infoParse(self, response):
     request = checkTimeError(response)
     if request:
         yield request
         return
     item = CyzoneItem()
     configs = self.configChance(response.url)
     result = dict()
     for config in configs['data']:
         k = config['En']
         result[k] = S.select_content(response, config, response)
         result[k] = S.replace_all(result[k])
     item['result'] = result
     item['keys'] = configs['list']['keys']
     item['db'] = configs['list']['db']
     if result[configs['list']['check']]:
         yield item
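
checkTimeError appears in several examples but is not defined here; a minimal sketch of the retry pattern it implies, assuming the block page is detectable from the status code (the detection rule is an assumption):

def checkTimeError(response):
    # Hypothetical sketch, not the project's implementation: retry the same
    # request, bypassing the dupefilter, when the page looks like a block page.
    if response.status in (403, 503):
        return response.request.replace(dont_filter=True)
    return None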
Example 6
 def scrapy_info_url_help(self,
                          response: Response,
                          config: dict = None,
                          callback: callable = None,
                          errback=None,
                          headers: dict = None,
                          urlfunc: callable = None,
                          bodyfunc: callable = None,
                          divmod: int = 1,  # note: shadows the builtin divmod()
                          meta=None,
                          priority=100,
                          redis_flag=False,
                          redis_conn=None,
                          dont_filter=False,
                          response_type: str = 'xpath',  # 'xpath' or 'json'
                          method: str = 'GET',  # 'GET' or 'POST'
                          flag=False,  # True: follow next page; False: generate all pages
                          pagestart=1,  # starting page
                          connect_type: str = 'urlencode') -> set:  # 'urlencode' or 'json'
      '''
      @params response      the response passed into parse
      @params config        config for extracting the total, via S.select_content
      @callback             callback function
      @headers              defaults to urlencoded form headers
      @urlfunc              usually a lambda building the page URL
      @connect_type         decides how the body is encoded
      @response_type        decides how values are extracted
      @method               Request method
      @divmod               divisor applied to the extracted total to get totalpage
      @bodyfunc             usually a lambda building the request body
      return a set of Requests
      '''
     dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
     if response_type.lower() == 'json':
         try:
             JS_response = json.loads(response.text)
          except Exception:
              # fall back to a JS evaluator for non-strict JSON (e.g. JSONP)
              JS_response = execjs.eval(response.text)
     else:
         JS_response = response
     reqs = set()
     urls = S.select_content(JS_response, config, response)
      if not isinstance(urls, list):
          urls = [urls]
     for page in urls:
          if not page:
              return []  # an empty value aborts request generation entirely
          if callable(bodyfunc):
              body = bodyfunc(page, response=response)
              if not isinstance(body, str):
                  body = dataencode(body)
          else:
              body = None
          if callable(urlfunc):
              if isinstance(page, tuple):
                  url = urlfunc(*page, response=response)
              else:
                  url = urlfunc(page, response=response)
         else:
             url = response.url
         _meta = response.meta.copy()
         meta = meta if meta else {}
         _meta.update(meta)
          req = _Request(
              url,
              method=method,
              body=body,
              headers=headers,
              meta=_meta,
              priority=priority,
              redis_flag=redis_flag,  # pass the parameters through instead of hardcoding
              redis_conn=redis_conn,
              dont_filter=dont_filter,
              callback=callback,
              errback=errback)
         reqs.add(req)
     return reqs
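
A hypothetical call, as it might appear inside a parse method, following the docstring's note that urlfunc and bodyfunc are usually lambdas; the endpoint, selector config, and field names are made up:

yield from self.scrapy_info_url_help(
    response,
    config={'En': 'ids', 'v': '$.data[*].id'},  # assumed selector-config shape
    urlfunc=lambda _id, response=None: 'https://example.com/detail/%s' % _id,
    callback=self.infoParse,
    response_type='json')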
Example 7
    def scrapy_page_help(self,
                         response: Response,
                         config: dict = None,
                         callback: callable = None,
                         headers: dict = None,
                         urlfunc: callable = None,
                         bodyfunc: callable = None,
                         divmod: int = 1,  # note: shadows the builtin divmod()
                         response_type: str = 'xpath',  # 'xpath' or 'json'
                         method: str = 'GET',  # 'GET' or 'POST'
                         flag=False,  # True: follow next page; False: generate all pages
                         pagestart=1,  # starting page
                         redis_flag=False,
                         redis_conn=None,
                         errback=None,
                         cookies=None,
                         offset=1,
                         meta=None,  # avoid a mutable default argument
                         readpage=128,  # pages generated per batch
                         connect_type: str = 'urlencode') -> set:  # 'urlencode' or 'json'
        '''
        @params response      the response passed into parse
        @params config        config for extracting the total, via S.select_content
        @callback             callback function
        @headers              defaults to urlencoded form headers
        @urlfunc              usually a lambda building the page URL
        @connect_type         decides how the body is encoded
        @response_type        decides how values are extracted
        @method               Request method
        @divmod               divisor applied to the extracted total to get totalpage
        @bodyfunc             usually a lambda building the request body
        return a set of Requests
        '''
        _pagestart = response.meta.get('pagestart') or pagestart
        _offset = response.meta.get('offset') or offset
        page = response.meta.get('page') or 1
        dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
        if not response.meta.get('totalpage'):
            if response_type.lower() == 'json':
                try:
                    JS_response = json.loads(response.text)
                except Exception:
                    # fall back: evaluate JS-style bodies; accept an already
                    # parsed dict/list; otherwise use an empty dict
                    if hasattr(response, 'text'):
                        JS_response = execjs.eval(response.text)
                    elif isinstance(response, (dict, list)):
                        JS_response = response
                    else:
                        JS_response = {}
            else:
                JS_response = response
        else:
            JS_response = response

        reqs = set()
        # page-listing mode: compute the max page, then generate requests in
        # batches of readpage pages at a time to limit memory use
        if not flag:
            if response.meta.get('totalpage'):
                totalpage = response.meta['totalpage']
            else:
                total = S.select_content(JS_response, config, response)
                totalpage = ceil(int(total) / divmod) if total else 1
            if page < totalpage:
                _readpage = readpage * _offset
                pagestart = _pagestart % _readpage
                if page % _readpage == pagestart:
                    minpage = min(page + _readpage, totalpage)
                    logger.info('from %s to %s, totalpage is %s' % (page + 1, minpage, totalpage))
                    for page in range(page + _offset, minpage + _offset, _offset):
                        if callable(bodyfunc):
                            body = bodyfunc(page, response=response)
                            if not isinstance(body, str):
                                body = dataencode(body)
                        else:
                            body = None
                        if callable(urlfunc):
                            url = urlfunc(page, response=response)
                        else:
                            url = response.url
                        _meta = response.meta.copy()
                        _meta.update({'page': page,
                                      'pagestart': _pagestart,
                                      'totalpage': totalpage,
                                      'offset': _offset})
                        req = _Request(
                            url,
                            method=method,
                            body=body,
                            headers=headers,
                            redis_flag=redis_flag,
                            redis_conn=redis_conn,
                            errback=errback,
                            cookies=cookies,
                            meta=_meta,
                            callback=callback)
                        reqs.add(req)
            elif page > totalpage:
                _readpage = readpage * _offset
                pagestart = _pagestart % _readpage
                if page % _readpage == pagestart:
                    minpage = max(page - _readpage, totalpage)
                    logger.info('from %s to %s, totalpage is %s' % (page, minpage, totalpage))
                    for page in range(minpage, page):
                        if callable(bodyfunc):
                            body = bodyfunc(page, response=response)
                            if not isinstance(body, str):
                                body = dataencode(body)
                        else:
                            body = None
                        if callable(urlfunc):
                            url = urlfunc(page, response=response)
                        else:
                            url = response.url
                        _meta = response.meta.copy()
                        _meta.update({'page': page,
                                      'pagestart': _pagestart,
                                      'totalpage': totalpage,
                                      'offset': _offset})
                        req = _Request(
                            url,
                            method=method,
                            body=body,
                            headers=headers,
                            redis_flag=redis_flag,
                            redis_conn=redis_conn,
                            errback=errback,
                            cookies=cookies,
                            meta=_meta,
                            callback=callback)
                        reqs.add(req)
        # next-page mode: if a next-page marker exists, emit the next batch
        elif flag:
            if S.select_content(JS_response, config):
                _readpage = readpage * _offset
                pagestart = _pagestart % _readpage
                if page % _readpage == pagestart:
                    logger.info('from %s to %s, totalpage is undefined' % (page + 1, page + _readpage))
                    for _page in range(page + 1, page + _readpage + 1):
                        if callable(urlfunc):
                            url = urlfunc(_page, response=response)
                        else:
                            url = response.url
                        if callable(bodyfunc):
                            body = bodyfunc(_page, response=response)
                            if not isinstance(body, str):
                                body = dataencode(body)
                        else:
                            body = None
                        _meta = response.meta.copy()
                        _meta.update({'page': _page,
                                      'pagestart': _pagestart,
                                      'offset': _offset})
                        req = _Request(
                            url,
                            method=method,
                            body=body,
                            headers=headers,
                            meta=_meta,
                            redis_flag=redis_flag,
                            redis_conn=redis_conn,
                            callback=callback,
                            errback=errback)
                        reqs.add(req)
            else:
                # no next-page marker found; dump the body for offline debugging
                with open('1.html', 'wb') as f:
                    f.write(response.body)
        return reqs
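
A worked example of the batching condition with the defaults pagestart=1, offset=1, readpage=128: a new batch is emitted only when page % 128 == 1, so batches start at pages 1, 129, 257, and so on.

readpage, offset, pagestart = 128, 1, 1
_readpage = readpage * offset              # 128
trigger = pagestart % _readpage            # 1
print([p for p in range(1, 400) if p % _readpage == trigger])
# [1, 129, 257, 385]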
Example 8
 def scrapy_page_help(self,
                      response: Response,
                      config: dict = None,
                      callback: callable = None,
                      headers: dict = None,
                      urlfunc: callable = None,
                      bodyfunc: callable = None,
                      divmod: int = 1,  # note: shadows the builtin divmod()
                      response_type: str = 'xpath',  # 'xpath' or 'json'
                      method: str = 'GET',  # 'GET' or 'POST'
                      flag=False,  # True: follow next page; False: generate all pages
                      pagestart=1,  # starting page
                      connect_type: str = 'urlencode') -> set:  # 'urlencode' or 'json'
      '''
      @params response      the response passed into parse
      @params config        config for extracting the total, via S.select_content
      @callback             callback function
      @headers              defaults to urlencoded form headers
      @urlfunc              usually a lambda building the page URL
      @connect_type         decides how the body is encoded
      @response_type        decides how values are extracted
      @method               Request method
      @divmod               divisor applied to the extracted total to get totalpage
      @bodyfunc             usually a lambda building the request body
      return a set of Requests
      '''
      page = response.meta.get('page', 1)
      # only the start page fans out all pages; next-page mode must run every time
      if page != pagestart and not flag:
          return []
      dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
     if response_type.lower() == 'json':
         JS_response = json.loads(response.text)
     else:
         JS_response = response
      if response.meta.get('totalpage'):
          totalpage = response.meta['totalpage']
      else:
          total = S.select_content(JS_response, config)
          totalpage = ceil(int(total) / divmod) if total else 1
     reqs = set()
     logger.info('totalpage is %s' % totalpage)
      # page-listing mode: compute the max page and generate all requests
     if page < totalpage and not flag:
         for page in range(page + 1, totalpage + 1):
              if callable(bodyfunc):
                  body = bodyfunc(page, response=response)
                  if not isinstance(body, str):
                      body = dataencode(body)
              else:
                  body = None
             if callable(urlfunc):
                 url = urlfunc(page)
             else:
                 url = response.url
             req = scrapy.Request(
                 url,
                 method=method,
                 body=body,
                 headers=headers,
                 meta={'page': page,
                       'totalpage': totalpage},
                 callback=callback)
             reqs.add(req)
      # next-page mode: follow one page at a time
      elif page < totalpage and flag:
          if S.select_content(JS_response, config):
              page += 1
              # url and body would otherwise be unbound in this branch
              if callable(bodyfunc):
                  body = bodyfunc(page, response=response)
                  if not isinstance(body, str):
                      body = dataencode(body)
              else:
                  body = None
              url = urlfunc(page) if callable(urlfunc) else response.url
              req = scrapy.Request(
                  url,
                  method=method,
                  body=body,
                  headers=headers,
                  meta={'page': page,
                        'totalpage': totalpage},
                  callback=callback)
              reqs.add(req)
     return reqs
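
A worked example of the totalpage computation: if S.select_content extracts a total of 95 records and divmod is 20 records per page, ceil(95 / 20) gives 5 pages (numbers illustrative):

from math import ceil
total = 95      # value extracted by S.select_content (illustrative)
per_page = 20   # passed as the divmod parameter, which shadows the builtin
print(ceil(int(total) / per_page))  # 5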