def parse(self, response):
    """List-page callback: retry on time errors, follow detail links with the
    config matched to this URL, then schedule next-page requests."""
    retry = checkTimeError(response)
    if retry:
        yield retry
        return False
    follow_cfg = choice(response.url, urlconfigs)
    try:
        for link in S.select_content(response, follow_cfg, response):
            link = response.urljoin(link)
            headers['Referer'] = link
            yield scrapy.Request(link, headers=headers, callback=self.infoParse)
    except Exception as exc:
        print(response.text, exc)
    page_cfg = choice(response.url, pageConfigs)
    try:
        for link in S.select_content(response, page_cfg, response):
            print(link)
            link = response.urljoin(link)
            headers['Referer'] = link
            yield scrapy.Request(link, headers=headers)
    except Exception as exc:
        print(response.text, exc)
def item_parse(self, _configs: list, response, response1=None) -> dict:
    """Yield items extracted from *response* according to *_configs*.

    _configs: list of field-extraction config dicts.
    response: the Response (or pre-selected fragment) to extract from.
    response1: context Response for selector helpers; defaults to *response*
        when it looks like a real Response (has a ``url`` attribute).
    """
    if hasattr(response, 'url'):
        response1 = response
    for configs in _configs:
        adapted = self.change_response_f_type(configs, response)
        if configs['list']['v']:
            rows = S.select_content(adapted, configs['list'], response1) or []
        elif isinstance(adapted, list):
            rows = adapted
        else:
            rows = [adapted]
        for row in rows:
            if not row:
                # NOTE(review): a falsy row aborts the whole generator, not
                # just this config — confirm this is intended (vs. continue).
                return
            result = {}
            for field in configs['data']:
                value = S.select_content(row, field, response1)
                result[field['En']] = S.replace_all(value)
            item = self.item_db_parse(configs, result)
            if item:
                # persistent per-spider counter of produced items
                self.state['items_count'] = self.state.get('items_count', 0) + 1
                yield item
def chufaparse(self, response):
    """Parse the SZSE punishment (处罚) list page, enrich each row with the
    linked document's text, then paginate to the next page."""
    items = self.configParse(chufaConfigs, response, response)
    for _item in items:
        # the document link is embedded as "(...)" inside the contents cell
        url = re.search(r"\((.*?)\)", _item['result']['contents']).group(1).strip()
        url = response.urljoin(url)
        try:
            content = S._txtparse(url)
            content = S.replace_invalid_char(content)
        except Exception:
            # fixed: was a bare `except:` (also caught SystemExit/KeyboardInterrupt);
            # best-effort — keep the item even when the linked text can't be fetched
            content = None
        _item['result']['contents'] = content
        _item['result']['url'] = url
        yield _item
    page, totalpage, counts = getTotalPage(response)
    if page < totalpage:
        page += 1
        url = 'http://www.szse.cn/szseWeb/FrontController.szse?ACTIONID=7&AJAX=AJAX-TRUE&CATALOGID=1759_cxda&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT={totalpage}&tab1RECORDCOUNT={counts}&REPORT_ACTION=navigate'.format(
            page=page, totalpage=totalpage, counts=counts)
        yield scrapy.Request(
            url,
            headers=hdr(),
            meta={'page': page, 'totalpage': totalpage, 'counts': counts},
            callback=self.chufaparse,
            priority=1)
def infoParse(self, response):
    """Parse a detail page into a CyzoneItem; re-schedule on time errors.

    Only yields the item when the configured 'check' field extracted a value.
    """
    request = checkTimeError(response)
    if request:
        yield request
        return None
    # fixed: CyzoneItem() was constructed twice (dead duplicate before the guard)
    item = CyzoneItem()
    configs = self.configChance(response.url)
    result = dict()
    for config in configs['data']:
        k = config['En']
        result[k] = S.select_content(response, config, response)
        result[k] = S.replace_all(result[k])
    item['result'] = result
    item['keys'] = configs['list']['keys']
    item['db'] = configs['list']['db']
    if result[configs['list']['check']]:
        yield item
def getUrl2(response, config, formats=None, formats2=None):
    """Select URL fragments from *response* via *config* and render full URLs.

    formats: str.format template applied with tuple-unpacking (formats.format(*url)).
    formats2: %-style template applied per url when *formats* is falsy.
    Returns a list of rendered URLs. Fixed: previously returned an implicit
    None when nothing was selected — an empty list is equally falsy for
    callers but safe to iterate.
    """
    urls = S.select_content(response, config, response)
    if not urls:
        return []
    if formats:
        return [formats.format(*url) for url in urls]
    return [formats2 % url for url in urls]
def zrdsinfoparse(self, response):
    """Fill the resume text ('ins') into the partially-built item carried
    through response.meta, then yield it."""
    item = SzseItem()
    result = response.meta['result']
    resume_parts = response.xpath(
        '//span[@id="ViewResume1_lblContent"]/text()').extract()
    result['ins'] = S.replace_invalid_char("".join(resume_parts))
    item['result'] = result
    item['db'] = response.meta['db']
    item['keys'] = response.meta['keys']
    yield item
def infoParse(self, response):
    """Clean invalid HTML characters out of the body, run the content configs
    matched to this URL, and print the resulting items."""
    retry = checkTimeError(response)
    if retry:
        yield retry
        return False
    cleaned = S.replace_invalid_html_char(response.text)
    response = response.replace(body=cleaned)
    info_cfg = choice(response.url, contentsConfigs)
    for item in self.configParse(info_cfg, response, response):
        # NOTE(review): items are printed, not yielded — looks like debug
        # output left in place; kept as-is to preserve behaviour.
        print(item)
def configParse(self, configs, _response, response=None):
    """Config-driven extractor yielding SzseItem objects for the pipeline.

    configs: one config dict or a list of them.
    _response: response/fragment(s) to extract rows from.
    response: context Response handed to the selector helpers.
    """
    # NOTE(review): a single SzseItem instance is mutated and re-yielded
    # across rows — downstream consumers must copy it if they buffer items.
    item = SzseItem()
    if isinstance(configs, dict):
        configs = [configs]
    for _configs in configs:
        # default the flag once; this mutates the shared config dict
        if _configs.get('flag') is None:
            _configs['flag'] = True
        if _configs['list']['v'] and _configs['flag']:
            rows = S.select_content(_response, _configs['list'])
        elif isinstance(_response, list):
            rows = _response
        else:
            # wrap a single response so it is iterable
            rows = [_response]
        if not rows:
            continue
        for row in rows:
            result = dict()
            for config in _configs['data']:
                key = config['En']
                value = S.select_content(row, config, response)
                value = S.replace_invalid_char(value)
                result[key] = S.replace_invalid_html_char(value)
            # optional second-pass cleanup selectors
            for config in _configs.get('clear') or ():
                key = config['En']
                result[key] = S.select_content(result[key], config, response)
            item['result'] = result
            item['keys'] = _configs['list']['keys']
            item['db'] = _configs['list']['db']
            item['conn'] = _configs['list'].get('conn')
            # only hand the item to the pipeline when the check field is set
            if result[_configs['list']['check']]:
                yield item
def start_requests(self):
    """Seed requests: for the chinaclear pledge-proportion endpoint, emit one
    request per day from 2010-01-01 up to today, newest first."""
    page = 1
    target = 'http://www.chinaclear.cn/cms-rank/queryPledgeProportion?queryDate={date}&secCde='
    for url in self.start_urls:
        if url != target:
            continue
        today = datetime.datetime.now().strftime("%Y%m%d")
        datelist = S.datelist('20100101', today, "%Y.%m.%d")
        datelist.reverse()
        for date in datelist:
            yield scrapy.Request(
                url.format(date=date),
                meta={'page': page, 'date': date},
                headers=hdr(),
                priority=0)
def scrapy_info_url_help(self,
                         response: Response,
                         config: dict = None,
                         callback: callable = None,
                         errback=None,
                         headers: dict = None,
                         urlfunc: callable = None,
                         bodyfunc: callable = None,
                         divmod: int = 1,
                         meta=None,
                         priority=100,
                         redis_flag=False,
                         redis_conn=None,
                         dont_filter=False,
                         response_type: 'xpath' or 'json' = 'xpath',
                         method: 'GET' or 'POST' = 'GET',
                         flag=False,  # True: next-page mode, False: all pages
                         pagestart=1,  # first page number
                         connect_type: 'urlencode' or 'json' = 'urlencode') -> scrapy.Request:
    '''
    Build one Request per value selected from *response* by *config*.

    @ params response  parse()'s response
    @ params config    selector config handed to S.select_content
    @ callback         request callback
    @ headers          request headers
    @ urlfunc          callable(page, response=...) -> url (tuple pages are unpacked)
    @ bodyfunc         callable(page, response=...) -> body (str or encodable)
    @ connect_type     decides the body encoder ('urlencode' or 'json')
    @ response_type    decides how the response is parsed ('xpath' or 'json')
    @ method           Request method
    return set of Requests (empty list when a selected value is falsy)
    '''
    dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
    if response_type.lower() == 'json':
        try:
            JS_response = json.loads(response.text)
        except Exception:
            # fixed: was a bare `except:`; fall back to a JS literal eval
            JS_response = execjs.eval(response.text)
    else:
        JS_response = response
    reqs = set()
    urls = S.select_content(JS_response, config, response)
    if not isinstance(urls, list):
        urls = [urls]
    for page in urls:
        if not page:
            # original behaviour kept: a falsy selection bails out entirely
            return []
        if callable(bodyfunc):
            body = bodyfunc(page, response=response)
            if not isinstance(body, str):
                body = dataencode(body)
        else:
            body = None
        if callable(urlfunc):
            if isinstance(page, tuple):
                url = urlfunc(*page, response=response)
            else:
                url = urlfunc(page, response=response)
        else:
            url = response.url
        _meta = response.meta.copy()
        _meta.update(meta or {})
        req = _Request(
            url,
            method=method,
            body=body,
            headers=headers,
            meta=_meta,
            priority=priority,
            redis_flag=redis_flag,  # fixed: was hard-coded to False
            redis_conn=redis_conn,  # fixed: was hard-coded to None
            dont_filter=dont_filter,
            callback=callback,
            errback=errback)
        reqs.add(req)
    return reqs
def scrapy_page_help(self,
                     response: Response,
                     config: dict = None,
                     callback: callable = None,
                     headers: dict = None,
                     urlfunc: callable = None,
                     bodyfunc: callable = None,
                     divmod: int = 1,
                     response_type: 'xpath' or 'json' = 'xpath',
                     method: 'GET' or 'POST' = 'GET',
                     flag=False,  # True: next-page mode, False: generate all pages
                     pagestart=1,  # first page number
                     redis_flag=False,
                     redis_conn=None,
                     errback=None,
                     cookies=None,
                     offset=1,
                     meta=None,  # fixed: was a mutable default `meta={}` (unused in the body)
                     readpage=128,  # pages generated per batch (memory optimisation)
                     connect_type: 'urlencode' or 'json' = 'urlencode') -> scrapy.Request:
    '''
    Generate pagination Requests from *response* in batches of *readpage*.

    @ params response  parse()'s response
    @ params config    selector config for the total count (S.select_content)
    @ callback         request callback
    @ headers          request headers
    @ urlfunc          callable(page, response=...) -> url
    @ bodyfunc         callable(page, response=...) -> body (str or encodable)
    @ connect_type     decides the body encoder ('urlencode' or 'json')
    @ response_type    decides how the response is parsed ('xpath' or 'json')
    @ method           Request method
    @ divmod           divisor turning the selected total into a page count
    return set of Requests
    '''
    _pagestart = response.meta.get('pagestart') or pagestart
    _offset = response.meta.get('offset') or offset
    page = response.meta.get('page') or 1
    dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps

    def _payload(p):
        # build (url, body) for page p from the urlfunc/bodyfunc contract
        if callable(bodyfunc):
            body = bodyfunc(p, response=response)
            if not isinstance(body, str):
                body = dataencode(body)
        else:
            body = None
        url = urlfunc(p, response=response) if callable(urlfunc) else response.url
        return url, body

    def _request(p, extra_meta):
        url, body = _payload(p)
        _meta = response.meta.copy()
        _meta.update(extra_meta)
        return _Request(
            url, method=method, body=body, headers=headers,
            redis_flag=redis_flag, redis_conn=redis_conn, errback=errback,
            cookies=cookies, meta=_meta, callback=callback)

    # only reparse the body when the total page count is still unknown
    if not response.meta.get('totalpage') and response_type.lower() == 'json':
        try:
            JS_response = json.loads(response.text)
        except Exception:  # fixed: was a bare `except:`
            JS_response = (execjs.eval(response.text) if hasattr(response, 'text')
                           else response if isinstance(response, (dict, list)) else {})
    else:
        JS_response = response
    reqs = set()
    # fixed: removed debug junk `logger.info('page'*100)`
    if not flag:
        # fixed: `totalpage` was only assigned when not flag, but the original
        # evaluated `page < totalpage and not flag` — NameError when flag=True.
        total = S.select_content(JS_response, config, response)  # hoisted: was selected twice
        totalpage = (response.meta['totalpage'] if response.meta.get('totalpage')
                     else ceil(int(total) / divmod) if total else 1)
        _readpage = readpage * _offset
        batch_start = _pagestart % _readpage
        if page < totalpage:
            # generate the next batch of pages upward
            if page % _readpage == batch_start:
                minpage = min(page + _readpage, totalpage)
                logger.info('from %s to %s,totalpage is %s' % (page + 1, minpage, totalpage))
                for p in range(page + _offset, minpage + _offset, _offset):
                    reqs.add(_request(p, {'page': p, 'pagestart': _pagestart,
                                          'totalpage': totalpage, 'offset': _offset}))
        elif page > totalpage:
            # overshoot: walk back down toward totalpage
            if page % _readpage == batch_start:
                minpage = max(page - _readpage, totalpage)
                logger.info('from %s to %s,totalpage is %s' % (page, minpage, totalpage))
                for p in range(minpage, page):
                    reqs.add(_request(p, {'page': p, 'pagestart': _pagestart,
                                          'totalpage': totalpage, 'offset': _offset}))
    else:
        # next-page mode: no known total, generate the next readpage batch
        # while the config still selects a next-page marker
        if S.select_content(JS_response, config):
            _readpage = readpage * _offset
            batch_start = _pagestart % _readpage
            if page % _readpage == batch_start:
                # fixed typo: 'undefind' -> 'undefined'
                logger.info('from %s to %s,totalpage is undefined' % (page + 1, page + readpage))
                for p in range(page + 1, page + _readpage + 1):
                    reqs.add(_request(p, {'page': p, 'pagestart': _pagestart,
                                          'offset': _offset}))
        else:
            # no next-page marker: dump the body for offline debugging
            with open('1.html', 'wb') as f:
                f.write(response.body)
    return reqs
def scrapy_page_help(self,
                     response: Response,
                     config: dict = None,
                     callback: callable = None,
                     headers: dict = None,
                     urlfunc: callable = None,
                     bodyfunc: callable = None,
                     divmod: int = 1,
                     response_type: 'xpath' or 'json' = 'xpath',
                     method: 'GET' or 'POST' = 'GET',
                     flag=False,  # True: next-page mode, False: generate all pages
                     pagestart=1,  # first page number
                     connect_type: 'urlencode' or 'json' = 'urlencode') -> scrapy.Request:
    '''
    Generate all pagination Requests from the first page's response.

    @ params response  parse()'s response
    @ params config    selector config for the total count (S.select_content)
    @ callback         request callback
    @ headers          request headers
    @ urlfunc          callable(page) -> url
    @ bodyfunc         callable(page, response=...) -> body (str or encodable)
    @ connect_type     decides the body encoder ('urlencode' or 'json')
    @ response_type    decides how the response is parsed ('xpath' or 'json')
    @ method           Request method
    @ divmod           divisor turning the selected total into a page count
    return set of Requests ([] when not on the start page or in flag mode)
    '''
    page = response.meta.get('page', 1)
    # only the very first page fans out; note this guard also makes the
    # flag (next-page) branch below unreachable, matching the original.
    if page != pagestart or flag:
        return []
    # fixed: the 'json' body encoder was json.loads — a *decoder*, which
    # would raise TypeError on any dict/list body; json.dumps is correct.
    dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
    if response_type.lower() == 'json':
        JS_response = json.loads(response.text)
    else:
        JS_response = response
    total = S.select_content(JS_response, config)  # hoisted: was selected twice
    totalpage = (response.meta['totalpage'] if response.meta.get('totalpage')
                 else ceil(int(total) / divmod) if total else 1)
    reqs = set()
    logger.info('totalpage is %s' % totalpage)

    def _payload(p):
        # build (url, body) for page p
        if callable(bodyfunc):
            body = bodyfunc(p, response=response)
            if not isinstance(body, str):
                body = dataencode(body)
        else:
            body = None
        url = urlfunc(p) if callable(urlfunc) else response.url
        return url, body

    if page < totalpage and not flag:
        # generate every remaining page up-front
        for page in range(page + 1, totalpage + 1):
            url, body = _payload(page)
            reqs.add(scrapy.Request(
                url, method=method, body=body, headers=headers,
                meta={'page': page, 'totalpage': totalpage},
                callback=callback))
    elif page < totalpage and flag:
        # next-page mode (unreachable due to the guard above, kept for parity);
        # fixed: the original referenced `url` and `body` before assignment here
        if S.select_content(JS_response, config):
            page += 1
            url, body = _payload(page)
            reqs.add(scrapy.Request(
                url, method=method, body=body, headers=headers,
                meta={'page': page, 'totalpage': totalpage},
                callback=callback))
    return reqs
def start_requests(self):
    """Seed one request per configured SZSE endpoint.

    All but two endpoints share the same shape — format the page number into
    the URL template and GET it with a dedicated callback — so those are
    dispatched from a table. The POST list catalogs and the per-day margin
    (rzrq) endpoint keep their special handling.
    """
    self.Start = {
        '1110': self.colistparse,
        '1105': self.fundlistparse,
        '1273': self.zqparse,
    }
    base = 'http://www.szse.cn/szseWeb/FrontController.szse?ACTIONID=7&AJAX=AJAX-TRUE&CATALOGID='
    post_url = 'http://www.szse.cn/szseWeb/FrontController.szse'
    rzrq_url = base + '1837_xxpl&TABKEY=tab1&txtDate={date}&tab2PAGENO={page}&tab2PAGECOUNT=&tab2RECORDCOUNT=&REPORT_ACTION=navigate'
    # the tfpxx template embeds today's date via %s, as in the original
    tfpxx_url = base + '1798&TABKEY=tab1&txtKsrq=2000-01-01&txtZzrq=%s&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate' % datetime.datetime.now().strftime("%Y-%m-%d")
    # simple endpoints: URL template -> parse callback
    routes = {
        base + '1900&TABKEY=tab2&tab2PAGENO={page}&tab2PAGECOUNT=&tab2RECORDCOUNT=&REPORT_ACTION=navigate': self.zrdsparse,
        base + '1901&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate': self.zrdmparse,
        base + '1901&TABKEY=tab2&tab2PAGENO={page}&tab2PAGECOUNT=&tab2RECORDCOUNT=&REPORT_ACTION=navigate': self.dmzgpxdaparse,
        base + '1900&TABKEY=tab3&tab3PAGENO={page}&tab3PAGECOUNT=&tab3RECORDCOUNT=&REPORT_ACTION=navigate': self.dsrckparse,
        base + '1759_cxda&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate': self.chufaparse,
        base + '1903_detail&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate': self.zhongjiechufaparse,
        base + '1902&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate': self.jiechuxianshoufaparse,
        base + '1902&TABKEY=tab2&tab2PAGENO={page}&tab2PAGECOUNT=&tab2RECORDCOUNT=&REPORT_ACTION=navigate': self.jiechuxianshou1perfaparse,
        base + '1902&TABKEY=tab3&tab3PAGENO={page}&tab3PAGECOUNT=&tab3RECORDCOUNT=&REPORT_ACTION=navigate': self.jiechuxianshou5perfaparse,
        base + 'sgshqd&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate': self.sghgqdparse,
        # note: this catalog really uses lowercase report_action
        base + '1931_zcjhcjxx&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&report_action=navigate': self.zcjhcjxxparse,
        base + '1839_zcjhcpxx&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate': self.zcjhcpxxparse,
        tfpxx_url: self.tfpxxparse,
        base + 'SSGSGMXX&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate': self.fullnamechangeparse,
        base + 'SSGSGMXX&TABKEY=tab2&tab2PAGENO={page}&tab2PAGECOUNT=&tab2RECORDCOUNT=&REPORT_ACTION=navigate': self.shortnamechangeparse,
        base + '1793_ssgs&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate': self.suspendListingparse,
        base + '1793_ssgs&TABKEY=tab2&tab2PAGENO={page}&tab2PAGECOUNT=&tab2RECORDCOUNT=&REPORT_ACTION=navigate': self.StopListingparse,
        base + 'xmjdxx&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT=&tab1RECORDCOUNT=&REPORT_ACTION=navigate': self.projparse,
    }
    for _url in self.start_urls:
        page = 1
        if _url == post_url:
            # the three list catalogs are POSTed with form data
            for CATALOGID, callback in self.Start.items():
                postdata = szse_data(page, CATALOGID)
                meta = {'CATALOGID': CATALOGID, 'page': page, '_url': _url}
                yield scrapy.Request(_url, callback=callback, method='POST',
                                     headers=hdr(), meta=meta, body=postdata)
        elif _url == rzrq_url:
            # margin-trading disclosure: one request per day since 2010, newest first
            today = datetime.datetime.now().strftime("%Y%m%d")
            datelist = S.datelist('20100101', today, "%Y-%m-%d")
            datelist.reverse()
            for date in datelist:
                yield scrapy.Request(
                    _url.format(page=page, date=date),
                    headers=hdr(),
                    meta={'page': page, 'date': date},
                    callback=self.rzrqparse,
                )
        elif _url in routes:
            yield scrapy.Request(
                _url.format(page=page),
                headers=hdr(),
                meta={'page': page},
                callback=routes[_url],
            )