def parse_list_page_common(self, response):
    """
    Generic list-page parser.
    Required meta fields are asserted below.
    :param response:
    :return:
    """
    assert 'crawl_key' in response.meta
    assert 'page_index' in response.meta
    assert 'param' in response.meta
    assert 'xpath_of_list' in response.meta['param']
    assert 'xpath_of_detail_url' in response.meta['param']
    assert 'item_parse_class' in response.meta['param']

    list_page_content_md5 = hashlib.md5(response.body).hexdigest()

    logging.info(
        'Get page list url, page:[{}], url:[{}], status:[{}], body md5:[{}]'
        .format(response.meta['page_index'], response.url, response.status,
                list_page_content_md5))

    logging.info('Crawl info: {}'.format(self.crawl_helper.crawl_info))
    crawl_key = response.meta['crawl_key']

    # Update the crawl-status table record
    # self.crawl_helper.store_crawl_info_2_db(crawl_key, 'active')

    if not self.crawl_helper.should_continue_page_parse(
            response, crawl_key, list_page_content_md5):
        return

    _item_idx = 0
    if response.meta['param']['requests_type'] == "dict":
        _request = response.text.encode('utf-8')
        _response_data = json.loads(response.text)
        # In "dict" mode, xpath_of_list is a sequence of keys used to walk into
        # the JSON payload until the record list is reached.
        for _dictn_num in response.meta['param']["xpath_of_list"]:
            _response_data = _response_data[_dictn_num]
        for selector in _response_data:
            _detail_url = ''
            try:
                _item_idx += 1
                # Example detail URLs:
                # http://ggzyjy.sc.gov.cn/jyxx/002004/002004003/20200220/fa95a2be-7763-44b9-b94e-33dae1f82aea.html
                # /jyxx/002004/002004003/20200220/0b9c5576-6c08-4593-9a2c-44c6ec8e3ff7.html
                _detail_url = 'http://ggzyjy.sc.gov.cn/' + str(
                    selector[response.meta['param']["xpath_of_detail_url"]])
                _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

                logging.info('Parse item, [{}]-[{}/{}]'.format(
                    crawl_key, _item_idx, response.meta['page_index']))

                # Check whether the record is already stored, and break/skip accordingly
                # loop_break, item_break = self.crawl_helper.should_continue_item_parse(crawl_key, _unq_id)
                # if loop_break:
                #     return
                # if item_break:
                #     continue

                # Build and yield the crawled item
                item_parser = response.meta['param']['item_parse_class'](
                    selector)
                item = item_parser.get_common_raw_item(
                    _id=_unq_id,
                    detail_url=_detail_url,
                    site=self.site,
                    ext_param=response.meta['param'])

                # Random short sleep
                time.sleep(random.randint(50, 100) / 1000.0)

                # Update the crawled-item count in the database
                # self.crawl_helper.increase_total_item_num(crawl_key)

                logging.info('item is: {}'.format(item))
                # yield item
            except Exception as e:
                logging.exception('Handle [{}] failed'.format(_detail_url))
    else:
        for _selector_num, selector in enumerate(
                response.xpath(response.meta['param']['xpath_of_list'])):
            _detail_url = ''
            try:
                _item_idx += 1
                _url_id = selector.xpath(
                    response.meta['param']
                    ['xpath_of_detail_url']).extract_first()
                _url_id = _url_id.split("'")[1].replace('\\r\\n', '')
                _detail_url = 'http://ec.ccccltd.cn/PMS/biddetail.shtml?id=' + str(
                    _url_id)
                _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

                logging.info('Parse item, [{}]-[{}/{}]'.format(
                    crawl_key, _item_idx, response.meta['page_index']))

                # Check whether the record is already stored, and break/skip accordingly
                # loop_break, item_break = self.crawl_helper.should_continue_item_parse(crawl_key, _unq_id)
                # if loop_break:
                #     return
                # if item_break:
                #     continue

                # Build and yield the crawled item
                item_parser = response.meta['param']['item_parse_class'](
                    selector)
                item = item_parser.get_common_raw_item(
                    _id=_unq_id,
                    detail_url=_detail_url,
                    site=self.site,
                    ext_param=response.meta['param'])

                # Random short sleep
                time.sleep(random.randint(50, 100) / 1000.0)

                # Update the crawled-item count in the database
                # self.crawl_helper.increase_total_item_num(crawl_key)

                logging.info('item is: {}'.format(item))
                # yield item
            except Exception as e:
                logging.exception('Handle [{}] failed'.format(_detail_url))
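
# A minimal sketch (not part of the spider code) of how the "dict" branch above
# consumes a JSON list page: in that mode param['xpath_of_list'] is a sequence of
# keys walked into the payload, and param['xpath_of_detail_url'] is the key holding
# the URL fragment. The payload shape and key names below are illustrative
# assumptions, not the real site response.
def _demo_walk_json_list():
    payload = {"result": {"records": [{"linkurl": "/jyxx/demo.html"}]}}
    data = payload
    for key in ["result", "records"]:  # plays the role of param['xpath_of_list']
        data = data[key]
    # plays the role of param['xpath_of_detail_url']
    return ['http://ggzyjy.sc.gov.cn/' + str(rec["linkurl"]) for rec in data]
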
def parse_list_page_common(self, response):
    """
    Generic list-page parser.
    Required meta fields are asserted below.
    :param response:
    :return:
    """
    assert 'crawl_key' in response.meta
    assert 'page_index' in response.meta
    assert 'param' in response.meta
    assert 'xpath_of_list' in response.meta['param']
    assert 'xpath_of_detail_url' in response.meta['param']
    assert 'item_parse_class' in response.meta['param']

    list_page_content_md5 = hashlib.md5(response.body).hexdigest()

    logging.info(
        'Get page list url, page:[{}], url:[{}], status:[{}], body md5:[{}]'
        .format(response.meta['page_index'], response.url, response.status,
                list_page_content_md5))

    logging.info('Crawl info: {}'.format(self.crawl_helper.crawl_info))
    crawl_key = response.meta['crawl_key']

    # Update the crawl-status table record
    self.crawl_helper.store_crawl_info_2_db(crawl_key, 'active')

    if not self.crawl_helper.should_continue_page_parse(
            response, crawl_key, list_page_content_md5):
        return

    _item_idx = 0
    for selector in response.xpath(
            response.meta['param']['xpath_of_list']):
        _detail_url = ''
        try:
            _item_idx += 1
            _detail_url = selector.xpath(
                response.meta['param']
                ['xpath_of_detail_url']).extract_first().replace(" ", "")
            _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

            logging.info('Parse item, [{}]-[{}/{}]'.format(
                crawl_key, _item_idx, response.meta['page_index']))

            # Check whether the record is already stored, and break/skip accordingly
            loop_break, item_break = self.crawl_helper.should_continue_item_parse(
                crawl_key, _unq_id)
            if loop_break:
                return
            if item_break:
                continue

            # Build and yield the crawled item
            item_parser = response.meta['param']['item_parse_class'](
                selector)
            item = item_parser.get_common_raw_item(
                _id=_unq_id,
                detail_url=_detail_url,
                site=self.site,
                ext_param=response.meta['param'])

            # Random short sleep
            time.sleep(random.randint(50, 100) / 1000.0)

            # Update the crawled-item count in the database
            self.crawl_helper.increase_total_item_num(crawl_key)

            logging.info('item is: {}'.format(item))
            yield item
        except Exception as e:
            logging.exception('Handle [{}] failed'.format(_detail_url))
def parse_list_page_common(self, response):
    """
    Generic list-page parser.
    Required meta fields are asserted below.
    :param response:
    :return:
    """
    assert 'crawl_key' in response.meta
    assert 'page_index' in response.meta
    assert 'param' in response.meta
    assert 'xpath_of_list' in response.meta['param']
    assert 'xpath_of_detail_url' in response.meta['param']
    assert 'item_parse_class' in response.meta['param']

    list_page_content_md5 = hashlib.md5(response.body).hexdigest()

    logging.info(
        'Get page list url, page:[{}], url:[{}], status:[{}], body md5:[{}]'
        .format(response.meta['page_index'], response.url, response.status,
                list_page_content_md5))

    logging.info('Crawl info: {}'.format(self.crawl_helper.crawl_info))
    crawl_key = response.meta['crawl_key']

    # Update the crawl-status table record
    # self.crawl_helper.store_crawl_info_2_db(crawl_key, 'active')

    if not self.crawl_helper.should_continue_page_parse(
            response, crawl_key, list_page_content_md5):
        return

    _item_idx = 0
    if response.meta['param']['requests_type'] == "dict":
        _request = response.text.encode('utf-8')
        _response_data = json.loads(response.text)
        # _dict_xpath = response.meta['param']['xpath_of_list'].split("/")
        # if len(_dict_xpath) > 1:
        # In "dict" mode, xpath_of_list is a sequence of keys used to walk into
        # the JSON payload until the record list is reached.
        for _dictn_num in response.meta['param']["xpath_of_list"]:
            _response_data = _response_data[_dictn_num]
        for selector in _response_data:
            _detail_url = ''
            try:
                _item_idx += 1
                # _detail_url = response.urljoin(
                #     selector.xpath(response.meta['param']['xpath_of_detail_url']).extract_first())
                _detail_url = response.urljoin(
                    selector[response.meta['param']['xpath_of_detail_url']])
                _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

                logging.info('Parse item, [{}]-[{}/{}]'.format(
                    crawl_key, _item_idx, response.meta['page_index']))

                # Check whether the record is already stored, and break/skip accordingly
                # loop_break, item_break = self.crawl_helper.should_continue_item_parse(crawl_key, _unq_id)
                # if loop_break:
                #     return
                # if item_break:
                #     continue

                # Build and yield the crawled item
                item_parser = response.meta['param']['item_parse_class'](
                    selector)
                item = item_parser.get_common_raw_item(
                    _id=_unq_id,
                    detail_url=_detail_url,
                    site=self.site,
                    ext_param=response.meta['param'])

                # Random short sleep
                time.sleep(random.randint(50, 100) / 1000.0)

                # Update the crawled-item count in the database
                # self.crawl_helper.increase_total_item_num(crawl_key)

                logging.info('item is: {}'.format(item))
                # yield item
            except Exception as e:
                logging.exception('Handle [{}] failed'.format(_detail_url))
    else:
        for selector in response.xpath(
                response.meta['param']['xpath_of_list']):
            _detail_url = ''
            try:
                _item_idx += 1
                _detail_url = response.urljoin(
                    selector.xpath(
                        response.meta['param']
                        ['xpath_of_detail_url']).extract_first())
                _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

                logging.info('Parse item, [{}]-[{}/{}]'.format(
                    crawl_key, _item_idx, response.meta['page_index']))

                # Check whether the record is already stored, and break/skip accordingly
                # loop_break, item_break = self.crawl_helper.should_continue_item_parse(crawl_key, _unq_id)
                # if loop_break:
                #     return
                # if item_break:
                #     continue

                # Build and yield the crawled item
                item_parser = response.meta['param']['item_parse_class'](
                    selector)
                item = item_parser.get_common_raw_item(
                    _id=_unq_id,
                    detail_url=_detail_url,
                    site=self.site,
                    ext_param=response.meta['param'])

                # Random short sleep
                time.sleep(random.randint(50, 100) / 1000.0)

                # Update the crawled-item count in the database
                # self.crawl_helper.increase_total_item_num(crawl_key)

                logging.info('item is: {}'.format(item))
                # yield item
            except Exception as e:
                logging.exception('Handle [{}] failed'.format(_detail_url))
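
# A minimal sketch (not part of the original spiders) of how a list-page request
# could be built so the asserts at the top of parse_list_page_common pass. Only the
# meta/param key names come from the parsers above; the URL, XPath expressions,
# crawl key, and the 'html' requests_type value are illustrative assumptions (any
# value other than "dict" selects the XPath branch), and item_parse_class would be
# a real item-parser class in the actual spider.
def _demo_build_list_request(self, page_index):
    import scrapy
    param = {
        'requests_type': 'html',                         # assumed non-"dict" mode
        'xpath_of_list': '//ul[@class="news-list"]/li',  # hypothetical XPath
        'xpath_of_detail_url': './a/@href',              # hypothetical XPath
        'item_parse_class': None,                        # placeholder only
    }
    return scrapy.Request(
        url='http://example.com/list?page={}'.format(page_index),  # hypothetical URL
        callback=self.parse_list_page_common,
        meta={
            'crawl_key': 'example_site_list',  # hypothetical crawl key
            'page_index': page_index,
            'param': param,
        })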