def item_parse(self, _configs: list, response, response1=None) -> dict:
    '''
    @params _configs -> field extraction settings, list
    @params response -> Response
    @output -> result, a dict of field-value pairs
    '''
    if hasattr(response, 'url'):
        response1 = response
    for configs in _configs:
        response_change = self.change_response_f_type(configs, response)
        if configs['list']['v']:
            _response_copy = S.select_content(response_change, configs['list'], response1) or []
        else:
            if isinstance(response_change, list):
                _response_copy = response_change
            else:
                _response_copy = [response_change]
        for _response in _response_copy:
            if not _response:
                return
            result = dict()
            for config in configs['data']:
                result[config['En']] = S.select_content(_response, config, response1)
                result[config['En']] = S.replace_all(result[config['En']])
            item = self.item_db_parse(configs, result)
            if item:
                # count the persisted item
                self.state['items_count'] = self.state.get('items_count', 0) + 1
                yield item
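# Illustrative only: a minimal sketch of the _configs structure item_parse expects,
# inferred from the keys it reads ('list', 'data', 'En'). The selector syntax and any
# key other than 'list', 'data' and 'En' are assumptions, not taken from the real configs.
example_configs = [{
    'list': {'v': '//div[@class="item"]'},            # hypothetical list selector; a falsy 'v' skips list extraction
    'data': [
        {'En': 'title', 'v': './/h3/text()'},          # each field config maps an 'En' name to a selector
        {'En': 'date',  'v': './/span[@class="date"]/text()'},
    ],
}]
# Assumed usage inside a spider callback:
#     yield from self.item_parse(example_configs, response)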
def parse(self, response):
    # re-yield the request produced by checkTimeError (e.g. a retry) and stop parsing this response
    request = checkTimeError(response)
    if request:
        yield request
        return
    # follow detail links
    followConfigs = choice(response.url, urlconfigs)
    try:
        urls = S.select_content(response, followConfigs, response)
        for url in urls:
            url = response.urljoin(url)
            headers['Referer'] = url
            yield scrapy.Request(url, headers=headers, callback=self.infoParse)
    except Exception as e:
        print(response.text, e)
    # follow pagination links
    nextPageConfigs = choice(response.url, pageConfigs)
    try:
        urls = S.select_content(response, nextPageConfigs, response)
        for url in urls:
            print(url)
            url = response.urljoin(url)
            headers['Referer'] = url
            yield scrapy.Request(url, headers=headers)
    except Exception as e:
        print(response.text, e)
def getUrl2(response, config, formats=None, formats2=None):
    # extract raw URL fragments, then format them into full URLs;
    # always return a list so callers can iterate safely
    urls = S.select_content(response, config, response)
    urlList = []
    if urls:
        for url in urls:
            if formats:
                # new-style template; the extracted value is unpacked as positional args
                urlList.append(formats.format(*url))
            else:
                # old-style %-template applied to the value directly
                urlList.append(formats2 % url)
    return urlList
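# Illustrative only: hypothetical call patterns for getUrl2. The config object and the
# template strings are assumptions; only the formats / formats2 behaviour comes from
# the function above.
# detail_urls = getUrl2(response, example_config,
#                       formats='https://example.com/detail/{0}/{1}')   # extracted values are tuples
# detail_urls = getUrl2(response, example_config,
#                       formats2='https://example.com/detail/%s')       # extracted values are scalars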
def configParse(self, configs, _response, response=None):
    if isinstance(configs, dict):
        configs = [configs]
    for _configs in configs:  # iterate over one or more configs
        if _configs.get('flag') is None:
            _configs['flag'] = True
        if _configs['list']['v'] and _configs['flag']:
            res = S.select_content(_response, _configs['list'])
        elif isinstance(_response, list):
            res = _response
        else:
            # wrap the response in a list so it is iterable
            res = [_response]
        if res:
            for _res in res:
                # create a fresh item per record so previously yielded items are not mutated
                item = SzseItem()
                # initialise result and extract each configured field
                result = dict()
                for config in _configs['data']:
                    k = config['En']
                    result[k] = S.select_content(_res, config, response)
                    result[k] = S.replace_invalid_char(result[k])
                    result[k] = S.replace_invalid_html_char(result[k])
                if _configs.get('clear'):
                    for config in _configs['clear']:
                        k = config['En']
                        result[k] = S.select_content(result[k], config, response)
                item['result'] = result
                item['keys'] = _configs['list']['keys']
                item['db'] = _configs['list']['db']
                item['conn'] = _configs['list'].get('conn')
                # hand the item fields to the pipeline, but only if the check field is non-empty
                if result[_configs['list']['check']]:
                    yield item
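# Illustrative only: a minimal sketch of a config dict configParse could consume, based
# on the keys it reads ('list', 'data', 'clear', 'flag'). Every selector, table name and
# field name below is hypothetical.
example_config = {
    'list': {
        'v': '//table//tr',        # row selector; a falsy 'v' means _response is used as-is
        'keys': ['code'],          # primary-key fields used by the pipeline
        'db': 'szse_example',      # target table/collection
        'conn': None,              # optional connection name
        'check': 'code',           # an item is yielded only if this field is non-empty
    },
    'data': [
        {'En': 'code', 'v': './td[1]/text()'},
        {'En': 'name', 'v': './td[2]/text()'},
    ],
    'clear': [                     # optional post-processing applied to already-extracted values
        {'En': 'name', 'v': 'some_cleanup_rule'},
    ],
}
# Assumed usage:  yield from self.configParse(example_config, response, response)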
def infoParse(self, response):
    # re-yield the request produced by checkTimeError (e.g. a retry) and stop parsing this response
    request = checkTimeError(response)
    if request:
        yield request
        return
    item = CyzoneItem()
    configs = self.configChance(response.url)
    result = dict()
    # extract every configured field from the detail page
    for config in configs['data']:
        k = config['En']
        result[k] = S.select_content(response, config, response)
        result[k] = S.replace_all(result[k])
    item['result'] = result
    item['keys'] = configs['list']['keys']
    item['db'] = configs['list']['db']
    # only yield when the check field was actually extracted
    if result[configs['list']['check']]:
        yield item
def scrapy_info_url_help(self, response: Response,
                         config: dict = None,
                         callback: callable = None,
                         errback=None,
                         headers: dict = None,
                         urlfunc: callable = None,
                         bodyfunc: callable = None,
                         divmod: int = 1,
                         meta=None,
                         priority=100,
                         redis_flag=False,
                         redis_conn=None,
                         dont_filter=False,
                         response_type: str = 'xpath',   # 'xpath' or 'json'
                         method: str = 'GET',            # 'GET' or 'POST'
                         flag=False,                     # True: follow next page; False: generate all pages
                         pagestart=1,                    # starting page
                         connect_type: str = 'urlencode') -> set:  # 'urlencode' or 'json'
    '''
    @params response       the response passed to parse()
    @params config         settings for extracting values via S.select_content
    @params callback       callback function
    @params headers        request headers (body defaults to urlencode)
    @params urlfunc        usually a lambda that builds the request URL
    @params connect_type   determines how the body is encoded
    @params response_type  determines how parameters are extracted
    @params method         request method
    @params divmod         divisor used to derive totalpage from the extracted total
    @params bodyfunc       usually a lambda that builds the request body
    return [Requests]
    '''
    dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
    if response_type.lower() == 'json':
        try:
            JS_response = json.loads(response.text)
        except Exception:
            # fall back to a JS engine for responses such as JSONP / JS literals
            JS_response = execjs.eval(response.text)
    else:
        JS_response = response
    reqs = set()
    urls = S.select_content(JS_response, config, response)
    if not isinstance(urls, list):
        urls = [urls]
    for page in urls:
        if not page:
            return []
        if callable(bodyfunc):
            body = bodyfunc(page, response=response)
            if not isinstance(body, str):
                body = dataencode(body)
        else:
            body = None
        if callable(urlfunc):
            # tuples are unpacked so multi-field extractions can fill several placeholders
            url = urlfunc(*page, response=response) if isinstance(page, tuple) else urlfunc(page, response=response)
        else:
            url = response.url
        _meta = response.meta.copy()
        _meta.update(meta if meta else {})
        req = _Request(
            url, method=method, body=body, headers=headers, meta=_meta,
            priority=priority,
            redis_flag=redis_flag, redis_conn=redis_conn,  # pass the arguments through instead of hard-coding False/None
            dont_filter=dont_filter, callback=callback, errback=errback)
        reqs.add(req)
    return reqs
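# Illustrative only: an assumed call to scrapy_info_url_help from a parse() callback.
# The config, URL-building lambda and header dict are hypothetical; only the parameter
# names come from the helper above.
# yield from self.scrapy_info_url_help(
#     response,
#     config={'v': '//a[@class="detail"]/@href'},                # hypothetical selector for detail links
#     callback=self.infoParse,
#     headers=headers,
#     urlfunc=lambda href, response=None: response.urljoin(href),
#     method='GET',
#     response_type='xpath')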
def scrapy_page_help(self, response: Response,
                     config: dict = None,
                     callback: callable = None,
                     headers: dict = None,
                     urlfunc: callable = None,
                     bodyfunc: callable = None,
                     divmod: int = 1,
                     response_type: str = 'xpath',   # 'xpath' or 'json'
                     method: str = 'GET',            # 'GET' or 'POST'
                     flag=False,                     # True: follow next page; False: generate all pages
                     pagestart=1,                    # starting page
                     redis_flag=False,
                     redis_conn=None,
                     errback=None,
                     cookies=None,
                     offset=1,
                     meta={},
                     readpage=128,                   # number of pages generated per batch
                     connect_type: str = 'urlencode') -> set:  # 'urlencode' or 'json'
    '''
    @params response       the response passed to parse()
    @params config         settings for extracting the total via S.select_content
    @params callback       callback function
    @params headers        request headers (body defaults to urlencode)
    @params urlfunc        usually a lambda that builds the request URL
    @params connect_type   determines how the body is encoded
    @params response_type  determines how parameters are extracted
    @params method         request method
    @params divmod         divisor used to derive totalpage from the extracted total
    @params bodyfunc       usually a lambda that builds the request body
    return [Requests]
    '''
    _pagestart = response.meta.get('pagestart') or pagestart
    _offset = response.meta.get('offset') or offset
    page = response.meta.get('page') or 1
    dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
    if not response.meta.get('totalpage'):
        if response_type.lower() == 'json':
            try:
                JS_response = json.loads(response.text)
            except Exception:
                JS_response = execjs.eval(response.text) if hasattr(response, 'text') \
                    else response if isinstance(response, (dict, list)) else {}
        else:
            JS_response = response
    else:
        JS_response = response
    reqs = set()
    logger.info('page' * 100)  # debug marker
    # Read the max page number directly and generate requests; the logic now emits one
    # batch of pages at a time (readpage per batch) to keep memory usage down.
    if not flag:
        totalpage = response.meta['totalpage'] if response.meta.get('totalpage') else \
            ceil(int(S.select_content(JS_response, config, response)) / divmod) \
            if S.select_content(JS_response, config, response) else 1
    # check "not flag" first so totalpage is only referenced after it has been computed
    if not flag and page < totalpage:
        _readpage = readpage * _offset
        pagestart = _pagestart % _readpage
        if page % _readpage == pagestart:
            minpage = min(page + _readpage, totalpage)
            logger.info('from %s to %s, totalpage is %s' % (page + 1, minpage, totalpage))
            for page in range(page + _offset, minpage + _offset, _offset):
                if callable(bodyfunc):
                    body = bodyfunc(page, response=response)
                    if not isinstance(body, str):
                        body = dataencode(body)
                else:
                    body = None
                url = urlfunc(page, response=response) if callable(urlfunc) else response.url
                _meta = response.meta.copy()
                _meta.update({'page': page, 'pagestart': _pagestart,
                              'totalpage': totalpage, 'offset': _offset})
                req = _Request(
                    url, method=method, body=body, headers=headers,
                    redis_flag=redis_flag, redis_conn=redis_conn,
                    errback=errback, cookies=cookies, meta=_meta, callback=callback)
                reqs.add(req)
    elif not flag and page > totalpage:
        # the current page has overshot totalpage: walk back down toward totalpage
        _readpage = readpage * _offset
        pagestart = _pagestart % _readpage
        if page % _readpage == pagestart:
            minpage = max(page - _readpage, totalpage)
            logger.info('from %s to %s, totalpage is %s' % (page, minpage, totalpage))
            for page in range(minpage, page):
                if callable(bodyfunc):
                    body = bodyfunc(page, response=response)
                    if not isinstance(body, str):
                        body = dataencode(body)
                else:
                    body = None
                url = urlfunc(page, response=response) if callable(urlfunc) else response.url
                _meta = response.meta.copy()
                _meta.update({'page': page, 'pagestart': _pagestart,
                              'totalpage': totalpage, 'offset': _offset})
                req = _Request(
                    url, method=method, body=body, headers=headers,
                    redis_flag=redis_flag, redis_conn=redis_conn,
                    errback=errback, cookies=cookies, meta=_meta, callback=callback)
                reqs.add(req)
    # Next-page mode: the total is unknown, so generate the next batch of pages
    # whenever the "has more" selector still matches.
    elif flag:
        if S.select_content(JS_response, config):
            _readpage = readpage * _offset
            pagestart = _pagestart % _readpage
            if page % _readpage == pagestart:
                logger.info('from %s to %s, totalpage is undefined' % (page + 1, page + readpage))
                for _page in range(page + 1, page + _readpage + 1):
                    url = urlfunc(_page, response=response) if callable(urlfunc) else response.url
                    if callable(bodyfunc):
                        body = bodyfunc(_page, response=response)
                        if not isinstance(body, str):
                            body = dataencode(body)
                    else:
                        body = None
                    _meta = response.meta.copy()
                    _meta.update({'page': _page, 'pagestart': _pagestart, 'offset': _offset})
                    req = _Request(
                        url, method=method, body=body, headers=headers, meta=_meta,
                        redis_flag=redis_flag, redis_conn=redis_conn,
                        callback=callback, errback=errback)
                    reqs.add(req)
        else:
            # dump the response for debugging when the selector no longer matches
            # logger.error(response.text)
            with open('1.html', 'wb') as f:
                f.write(response.body)
    return reqs
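# Illustrative only: an assumed call to the batched scrapy_page_help for a JSON listing API.
# The URL template and the 'total' selector are hypothetical; only the helper's parameter
# names come from the function above.
# yield from self.scrapy_page_help(
#     response,
#     config={'v': 'data.total'},                                   # hypothetical selector for the record total
#     callback=self.parse,
#     headers=headers,
#     urlfunc=lambda page, response=None: 'https://example.com/api/list?page=%s' % page,
#     divmod=20,                                                    # 20 records per page -> totalpage = ceil(total / 20)
#     response_type='json',
#     method='GET',
#     flag=False)                                                   # False: generate all pages in readpage-sized batches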
def scrapy_page_help(self, response: Response,
                     config: dict = None,
                     callback: callable = None,
                     headers: dict = None,
                     urlfunc: callable = None,
                     bodyfunc: callable = None,
                     divmod: int = 1,
                     response_type: str = 'xpath',   # 'xpath' or 'json'
                     method: str = 'GET',            # 'GET' or 'POST'
                     flag=False,                     # True: follow next page; False: generate all pages
                     pagestart=1,                    # starting page
                     connect_type: str = 'urlencode') -> set:  # 'urlencode' or 'json'
    '''
    @params response       the response passed to parse()
    @params config         settings for extracting the total via S.select_content
    @params callback       callback function
    @params headers        request headers (body defaults to urlencode)
    @params urlfunc        usually a lambda that builds the request URL
    @params connect_type   determines how the body is encoded
    @params response_type  determines how parameters are extracted
    @params method         request method
    @params divmod         divisor used to derive totalpage from the extracted total
    @params bodyfunc       usually a lambda that builds the request body
    return [Requests]
    '''
    page = response.meta.get('page', 1)
    # only the first (seed) page fans out into the remaining page requests
    if page != pagestart or flag:
        return []
    # json.dumps (not loads) so the body is encoded, matching the other helpers
    dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
    if response_type.lower() == 'json':
        JS_response = json.loads(response.text)
    else:
        JS_response = response
    totalpage = response.meta['totalpage'] if response.meta.get('totalpage') else \
        ceil(int(S.select_content(JS_response, config)) / divmod) \
        if S.select_content(JS_response, config) else 1
    reqs = set()
    logger.info('totalpage is %s' % totalpage)
    # read the max page number directly and generate one request per remaining page
    if page < totalpage and not flag:
        for page in range(page + 1, totalpage + 1):
            if callable(bodyfunc):
                body = bodyfunc(page, response=response)
                if not isinstance(body, str):
                    body = dataencode(body)
            else:
                body = None
            url = urlfunc(page) if callable(urlfunc) else response.url
            req = scrapy.Request(
                url, method=method, body=body, headers=headers,
                meta={'page': page, 'totalpage': totalpage}, callback=callback)
            reqs.add(req)
    # next-page mode: advance a single page at a time
    # (note: unreachable as written, because flag=True already returns [] above)
    elif page < totalpage and flag:
        if S.select_content(JS_response, config):
            page += 1
            if callable(bodyfunc):
                body = bodyfunc(page, response=response)
                if not isinstance(body, str):
                    body = dataencode(body)
            else:
                body = None
            url = urlfunc(page) if callable(urlfunc) else response.url
            req = scrapy.Request(
                url, method=method, body=body, headers=headers,
                meta={'page': page, 'totalpage': totalpage}, callback=callback)
            reqs.add(req)
    return reqs
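# Illustrative only: this variant differs from the batched helper above in that it fans
# out every remaining page from the seed page in one go (no readpage batching). The
# selector and URL template below are hypothetical.
# yield from self.scrapy_page_help(
#     response,
#     config={'v': '//div[@id="total"]/text()'},    # hypothetical total-count selector
#     callback=self.parse,
#     urlfunc=lambda page: 'https://example.com/list_%s.html' % page,
#     divmod=30)                                    # 30 records per page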