def request(self, url, callback=None, dont_filter=False, method='GET',
            cookies=None, headers=None, priority=0, meta=None,
            encoding='utf-8', body=None, redis_flag=False, redis_conn=None):
    """Build a ``_Request``, filling in spider-level defaults.

    :param url: request URL.
    :param callback: response callback; defaults to ``self.parse``.
    :param headers: request headers; defaults to ``self.default_header``.
    :param redis_flag: when True the request carries a redis connection.
    :param redis_conn: redis connection to attach; falls back to ``self.r``.
    :returns: a ``_Request`` instance.
    """
    callback = callback if callback else self.parse
    headers = headers if headers else self.default_header
    if redis_flag:
        # Fix: the redis_conn argument was previously ignored and self.r was
        # always used; honour the caller's connection, defaulting to self.r.
        conn = redis_conn if redis_conn is not None else self.r
        return _Request(url, callback=callback, dont_filter=dont_filter,
                        body=body, method=method, cookies=cookies,
                        headers=headers, priority=priority, meta=meta,
                        encoding=encoding, redis_flag=redis_flag,
                        redis_conn=conn)
    return _Request(url, callback=callback, dont_filter=dont_filter,
                    body=body, method=method, cookies=cookies,
                    headers=headers, priority=priority, meta=meta,
                    encoding=encoding)
def scrapy_info_url_help(self, response: Response, config: dict = None,
                         callback: callable = None, errback=None,
                         headers: dict = None, urlfunc: callable = None,
                         bodyfunc: callable = None, divmod: int = 1,
                         meta=None, priority=100, redis_flag=False,
                         redis_conn=None, dont_filter=False,
                         response_type='xpath', method='GET',
                         flag=False,      # kept for signature compatibility; unused here
                         pagestart=1,     # kept for signature compatibility; unused here
                         connect_type='urlencode'):
    """Build one detail request per URL/value extracted from *response*.

    :param response: the response currently being parsed.
    :param config: selector config forwarded to ``S.select_content``.
    :param callback: per-request callback.
    :param errback: per-request error callback.
    :param headers: request headers.
    :param urlfunc: callable ``(page, response=...) -> url``; a tuple page is
        unpacked as positional args. Falls back to ``response.url``.
    :param bodyfunc: callable ``(page, response=...) -> body``; a non-str
        result is encoded according to *connect_type*.
    :param connect_type: 'urlencode' or 'json' — how a non-str body is encoded.
    :param response_type: 'json' parses ``response.text``; anything else uses
        the response object directly.
    :param meta: extra meta merged on top of ``response.meta``.
    :returns: a set of ``_Request`` objects (an empty list if a falsy value is
        extracted — behaviour preserved from the original).
    """
    dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
    if response_type.lower() == 'json':
        try:
            JS_response = json.loads(response.text)
        except Exception:  # narrowed from bare except; fall back to JS-literal eval
            JS_response = execjs.eval(response.text)
    else:
        JS_response = response
    reqs = set()
    urls = S.select_content(JS_response, config, response)
    if not isinstance(urls, list):
        urls = [urls]
    extra_meta = meta if meta else {}
    for page in urls:
        if not page:
            # NOTE(review): returns immediately, discarding any requests
            # already built — preserved from the original; confirm intended.
            return []
        if callable(bodyfunc):
            body = bodyfunc(page, response=response)
            if not isinstance(body, str):
                body = dataencode(body)
        else:
            body = None
        if callable(urlfunc):
            # A tuple page supplies several positional arguments to urlfunc.
            url = (urlfunc(*page, response=response) if isinstance(page, tuple)
                   else urlfunc(page, response=response))
        else:
            url = response.url
        _meta = response.meta.copy()
        _meta.update(extra_meta)
        # Fix: redis_flag/redis_conn were accepted as parameters but
        # hard-coded to False/None here; forward the caller's values.
        req = _Request(
            url, method=method, body=body, headers=headers, meta=_meta,
            priority=priority, redis_flag=redis_flag, redis_conn=redis_conn,
            dont_filter=dont_filter, callback=callback, errback=errback)
        reqs.add(req)
    return reqs
def scrapy_page_help(self, response: Response, config: dict = None,
                     callback: callable = None, headers: dict = None,
                     urlfunc: callable = None, bodyfunc: callable = None,
                     divmod: int = 1, response_type='xpath', method='GET',
                     flag=False,       # True: "next page" paging; False: generate page range
                     pagestart=1,      # starting page number
                     redis_flag=False, redis_conn=None, errback=None,
                     cookies=None, offset=1,
                     meta=None,        # kept for signature compatibility; unused here
                     readpage=128,     # pages generated per batch
                     connect_type='urlencode'):
    """Generate batched paging requests from *response*.

    Two modes:

    * ``flag=False`` — the total page count comes from
      ``response.meta['totalpage']`` or is extracted once via
      ``S.select_content(...)`` and divided by *divmod*; a batch of up to
      ``readpage * offset`` pages around the current page is emitted.
    * ``flag=True`` — "next page" mode: while ``S.select_content`` finds a
      next-page marker, the following batch of pages is emitted; when it does
      not, the body is dumped to ``1.html`` (debug behaviour preserved).

    :param urlfunc: callable ``(page, response=...) -> url``; falls back to
        ``response.url``.
    :param bodyfunc: callable ``(page, response=...) -> body``; a non-str
        result is encoded per *connect_type* ('urlencode' or 'json').
    :returns: a set of ``_Request`` objects (possibly empty).
    """
    _pagestart = response.meta.get('pagestart') or pagestart
    _offset = response.meta.get('offset') or offset
    page = response.meta.get('page') or 1
    dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps

    # Parse the body only when totalpage is not already cached in meta.
    if not response.meta.get('totalpage') and response_type.lower() == 'json':
        try:
            JS_response = json.loads(response.text)
        except Exception:  # narrowed from bare except
            if hasattr(response, 'text'):
                JS_response = execjs.eval(response.text)
            elif isinstance(response, (dict, list)):
                JS_response = response
            else:
                JS_response = {}
    else:
        JS_response = response

    reqs = set()

    def _build(page_no, extra_meta):
        # Shared request construction for every paging branch below.
        if callable(bodyfunc):
            body = bodyfunc(page_no, response=response)
            if not isinstance(body, str):
                body = dataencode(body)
        else:
            body = None
        url = urlfunc(page_no, response=response) if callable(urlfunc) else response.url
        _meta = response.meta.copy()
        _meta.update(extra_meta)
        return _Request(url, method=method, body=body, headers=headers,
                        redis_flag=redis_flag, redis_conn=redis_conn,
                        errback=errback, cookies=cookies, meta=_meta,
                        callback=callback)

    if not flag:
        # Fix: the original evaluated `page < totalpage and not flag`, which
        # raised NameError when flag=True because totalpage was only assigned
        # in the not-flag path; branching on flag first removes that.
        if response.meta.get('totalpage'):
            totalpage = response.meta['totalpage']
        else:
            # Hoisted: the original called S.select_content twice here.
            total = S.select_content(JS_response, config, response)
            totalpage = ceil(int(total) / divmod) if total else 1
        _readpage = readpage * _offset
        boundary = _pagestart % _readpage
        if page < totalpage:
            # Only the batch-boundary page fans out the next batch.
            if page % _readpage == boundary:
                minpage = min(page + _readpage, totalpage)
                logger.info('from %s to %s,totalpage is %s' % (page + 1, minpage, totalpage))
                for p in range(page + _offset, minpage + _offset, _offset):
                    reqs.add(_build(p, {'page': p, 'pagestart': _pagestart,
                                        'totalpage': totalpage, 'offset': _offset}))
        elif page > totalpage:
            # Current page overshot the total: back-fill down to totalpage.
            if page % _readpage == boundary:
                minpage = max(page - _readpage, totalpage)
                logger.info('from %s to %s,totalpage is %s' % (page, minpage, totalpage))
                for p in range(minpage, page):
                    reqs.add(_build(p, {'page': p, 'pagestart': _pagestart,
                                        'totalpage': totalpage, 'offset': _offset}))
    else:
        # "Next page" mode: continue while the next-page marker is present.
        if S.select_content(JS_response, config):
            _readpage = readpage * _offset
            boundary = _pagestart % _readpage
            if page % _readpage == boundary:
                logger.info('from %s to %s,totalpage is undefind' % (page + 1, page + readpage))
                for p in range(page + 1, page + _readpage + 1):
                    # Consistency fix: cookies were forwarded in the other
                    # branches but dropped here (handled inside _build).
                    reqs.add(_build(p, {'page': p, 'pagestart': _pagestart,
                                        'offset': _offset}))
        else:
            # Debug dump preserved from the original.
            # TODO(review): consider replacing with a log message.
            with open('1.html', 'wb') as f:
                f.write(response.body)
    return reqs