Example #1
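The three snippets below come from the same spider base class. They assume module-level imports and a few project-local helpers whose definitions are not shown; here is a minimal sketch of those surroundings, with the helper signatures inferred from usage:

    import json
    import logging
    import urllib.parse
    from math import ceil

    import execjs                    # PyExecJS, evaluates loose/JS-style JSON
    import scrapy
    from scrapy.http import Response

    logger = logging.getLogger(__name__)

    # Inferred project-local helpers (assumptions, not shown in the source):
    # _Request(url, callback=..., redis_flag=..., redis_conn=..., ...) -- a
    #   scrapy.Request subclass that can optionally be routed through Redis.
    # S.select_content(parsed, config, response=None) -- extracts a value or
    #   a list of values from an xpath Response or parsed JSON via `config`.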
    def request(self, url, callback=None, dont_filter=False,
                method='GET', cookies=None,
                headers=None, priority=0, meta=None, encoding='utf-8', body=None,
                redis_flag=False, redis_conn=None):
        # Fall back to the spider's default callback and headers.
        callback = callback if callback else self.parse
        headers = headers if headers else self.default_header
        if redis_flag:
            # Redis-backed request: honor an explicitly passed connection,
            # otherwise fall back to the spider's own connection (self.r).
            return _Request(url, callback=callback, dont_filter=dont_filter,
                            body=body, method=method, cookies=cookies,
                            headers=headers, priority=priority, meta=meta,
                            encoding=encoding, redis_flag=redis_flag,
                            redis_conn=redis_conn or self.r)
        return _Request(url, callback=callback, dont_filter=dont_filter,
                        body=body, method=method, cookies=cookies,
                        headers=headers, priority=priority, meta=meta,
                        encoding=encoding)
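A minimal usage sketch, assuming a spider that inherits this helper and defines the default_header and Redis connection (self.r) it relies on; the spider class, name, and URLs are illustrative only:

    class DemoSpider(BaseSpider):  # hypothetical subclass of the class above
        name = 'demo'

        def start_requests(self):
            # Plain request: falls back to self.parse and self.default_header.
            yield self.request('https://example.com/list')

        def parse(self, response):
            # Redis-backed request: redis_conn falls back to self.r.
            yield self.request('https://example.com/detail/1',
                               callback=self.parse_detail,
                               redis_flag=True, priority=10)

        def parse_detail(self, response):
            pass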
Example #2
    def scrapy_info_url_help(self,
                             response: Response,
                             config: dict = None,
                             callback: callable = None,
                             errback=None,
                             headers: dict = None,
                             urlfunc: callable = None,
                             bodyfunc: callable = None,
                             divmod: int = 1,   # unused in this helper
                             meta=None,
                             priority=100,
                             redis_flag=False,
                             redis_conn=None,
                             dont_filter=False,
                             response_type: str = 'xpath',   # 'xpath' or 'json'
                             method: str = 'GET',            # 'GET' or 'POST'
                             flag=False,     # True: follow next page; False: generate all pages (unused here)
                             pagestart=1,    # starting page number (unused here)
                             connect_type: str = 'urlencode') -> set:  # 'urlencode' or 'json'
        '''
        @param response       the Response handed to parse
        @param config         selector config passed to S.select_content to extract URLs
        @param callback       callback for the generated requests
        @param headers        request headers (body encoding defaults to urlencode)
        @param urlfunc        usually a lambda that builds each request URL
        @param connect_type   how the body dict is encoded ('urlencode' or 'json')
        @param response_type  how values are read from the response ('xpath' or 'json')
        @param method         request method
        @param divmod         divisor applied to the extracted total when computing totalpage
        @param bodyfunc       usually a lambda that builds each request body
        returns a set of Requests
        '''
        dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
        if response_type.lower() == 'json':
            try:
                JS_response = json.loads(response.text)
            except (ValueError, TypeError):
                # Not strict JSON: fall back to evaluating it as a JS literal.
                JS_response = execjs.eval(response.text)
        else:
            JS_response = response
        reqs = set()
        urls = S.select_content(JS_response, config, response)
        if not isinstance(urls, list):
            urls = [urls]
        for page in urls:
            if not page:
                # Nothing extracted: stop and return an empty set.
                return set()
            if callable(bodyfunc):
                body = bodyfunc(page, response=response)
                if not isinstance(body, str):
                    body = dataencode(body)
            else:
                body = None
            if callable(urlfunc):
                # A tuple is unpacked as positional arguments to urlfunc.
                if isinstance(page, tuple):
                    url = urlfunc(*page, response=response)
                else:
                    url = urlfunc(page, response=response)
            else:
                url = response.url
            _meta = response.meta.copy()
            _meta.update(meta if meta else {})
            req = _Request(
                url,
                method=method,
                body=body,
                headers=headers,
                meta=_meta,
                priority=priority,
                redis_flag=redis_flag,
                redis_conn=redis_conn,
                dont_filter=dont_filter,
                callback=callback,
                errback=errback)
            reqs.add(req)
        return reqs
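A usage sketch for this helper, assuming S.select_content returns a list of item IDs for the given config; the config shape, URL pattern, and lambda are assumptions, not part of the helper:

    def parse(self, response):
        # Turn each extracted ID into a detail-page request; urlfunc builds
        # the URL, and bodyfunc (if given) would be encoded per connect_type.
        yield from self.scrapy_info_url_help(
            response,
            config={'json': 'data.ids'},    # hypothetical config
            response_type='json',
            urlfunc=lambda _id, response=None:
                'https://example.com/detail?id=%s' % _id,
            callback=self.parse_detail)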
Example #3
    def scrapy_page_help(self,
                         response: Response,
                         config: dict = None,
                         callback: callable = None,
                         headers: dict = None,
                         urlfunc: callable = None,
                         bodyfunc: callable = None,
                         divmod: int = 1,
                         response_type: str = 'xpath',   # 'xpath' or 'json'
                         method: str = 'GET',            # 'GET' or 'POST'
                         flag=False,     # True: follow next page; False: generate all pages
                         pagestart=1,    # starting page number
                         redis_flag=False,
                         redis_conn=None,
                         errback=None,
                         cookies=None,
                         offset=1,
                         meta=None,      # avoid a mutable default; unused below
                         readpage=128,   # pages generated per batch
                         connect_type: str = 'urlencode') -> set:  # 'urlencode' or 'json'
        '''
        @param response       the Response handed to parse
        @param config         selector config passed to S.select_content to extract the total
        @param callback       callback for the generated requests
        @param headers        request headers (body encoding defaults to urlencode)
        @param urlfunc        usually a lambda that builds each request URL
        @param connect_type   how the body dict is encoded ('urlencode' or 'json')
        @param response_type  how values are read from the response ('xpath' or 'json')
        @param method         request method
        @param divmod         divisor applied to the extracted total when computing totalpage
        @param bodyfunc       usually a lambda that builds each request body
        returns a set of Requests
        '''
        _pagestart = response.meta.get('pagestart') or pagestart
        _offset = response.meta.get('offset') or offset
        page = response.meta.get('page') or 1
        dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
        if not response.meta.get('totalpage'):
            if response_type.lower() == 'json':
                try:
                    JS_response = json.loads(response.text)
                except (ValueError, TypeError):
                    # Fallback chain: evaluate loose JSON; accept an
                    # already-parsed dict/list; otherwise give up.
                    if hasattr(response, 'text'):
                        JS_response = execjs.eval(response.text)
                    elif isinstance(response, (dict, list)):
                        JS_response = response
                    else:
                        JS_response = {}
            else:
                JS_response = response
        else:
            JS_response = response

        reqs = set()
        # Branch 1: the total page count is known or extractable. Generate
        # page requests in batches of readpage at a time to limit memory use.
        if not flag:
            total = S.select_content(JS_response, config, response)
            totalpage = response.meta['totalpage'] if response.meta.get('totalpage') \
                else ceil(int(total) / divmod) if total else 1
            if page < totalpage:
                _readpage = readpage * _offset
                pagestart = _pagestart % _readpage
                # Only the response at a batch boundary spawns the next batch.
                if page % _readpage == pagestart:
                    minpage = min(page + _readpage, totalpage)
                    logger.info('from %s to %s, totalpage is %s' % (page + 1, minpage, totalpage))
                    for page in range(page + _offset, minpage + _offset, _offset):
                        if callable(bodyfunc):
                            body = bodyfunc(page, response=response)
                            if not isinstance(body, str):
                                body = dataencode(body)
                        else:
                            body = None
                        if callable(urlfunc):
                            url = urlfunc(page, response=response)
                        else:
                            url = response.url
                        _meta = response.meta.copy()
                        _meta.update({'page': page,
                                      'pagestart': _pagestart,
                                      'totalpage': totalpage,
                                      'offset': _offset})
                        req = _Request(
                            url,
                            method=method,
                            body=body,
                            headers=headers,
                            redis_flag=redis_flag,
                            redis_conn=redis_conn,
                            errback=errback,
                            cookies=cookies,
                            meta=_meta,
                            callback=callback)
                        reqs.add(req)
            elif page > totalpage:
                # Overshoot: walk back down toward totalpage in one batch.
                _readpage = readpage * _offset
                pagestart = _pagestart % _readpage
                if page % _readpage == pagestart:
                    minpage = max(page - _readpage, totalpage)
                    logger.info('from %s to %s, totalpage is %s' % (page, minpage, totalpage))
                    for page in range(minpage, page):
                        if callable(bodyfunc):
                            body = bodyfunc(page, response=response)
                            if not isinstance(body, str):
                                body = dataencode(body)
                        else:
                            body = None
                        if callable(urlfunc):
                            url = urlfunc(page, response=response)
                        else:
                            url = response.url
                        _meta = response.meta.copy()
                        _meta.update({'page': page,
                                      'pagestart': _pagestart,
                                      'totalpage': totalpage,
                                      'offset': _offset})
                        req = _Request(
                            url,
                            method=method,
                            body=body,
                            headers=headers,
                            redis_flag=redis_flag,
                            redis_conn=redis_conn,
                            errback=errback,
                            cookies=cookies,
                            meta=_meta,
                            callback=callback)
                        reqs.add(req)
            # page == totalpage: nothing left to schedule.
        # Branch 2: no total page count. As long as a "next page" indicator
        # is present in the response, emit the next batch of readpage pages.
        else:
            if S.select_content(JS_response, config):
                _readpage = readpage * _offset
                pagestart = _pagestart % _readpage
                if page % _readpage == pagestart:
                    logger.info('from %s to %s, totalpage is undefined' % (page + 1, page + _readpage))
                    for _page in range(page + 1, page + _readpage + 1):
                        if callable(urlfunc):
                            url = urlfunc(_page, response=response)
                        else:
                            url = response.url
                        if callable(bodyfunc):
                            body = bodyfunc(_page, response=response)
                            if not isinstance(body, str):
                                body = dataencode(body)
                        else:
                            body = None
                        _meta = response.meta.copy()
                        _meta.update({'page': _page,
                                      'pagestart': _pagestart,
                                      'offset': _offset})
                        req = _Request(
                            url,
                            method=method,
                            body=body,
                            headers=headers,
                            meta=_meta,
                            redis_flag=redis_flag,
                            redis_conn=redis_conn,
                            callback=callback,
                            errback=errback)
                        reqs.add(req)
            else:
                # No next-page indicator: dump the body for offline inspection.
                with open('1.html', 'wb') as f:
                    f.write(response.body)
        return reqs
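A pagination sketch, again with illustrative values. With flag=False the helper reads the total from the first response, computes totalpage = ceil(total / divmod), and emits page requests readpage at a time; each generated request carries its page number in meta so later responses can continue the batch. The config, page size, and lambda below are assumptions:

    def parse(self, response):
        # e.g. the API reports {"data": {"total": 990}} and serves 20 items
        # per page, so divmod=20 yields totalpage = ceil(990 / 20) = 50.
        yield from self.scrapy_page_help(
            response,
            config={'json': 'data.total'},   # hypothetical config
            response_type='json',
            divmod=20,
            method='POST',
            connect_type='json',
            bodyfunc=lambda page, response=None: {'page': page, 'size': 20},
            callback=self.parse)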