def push_item(json_list, item: DataItem, title, name):
    """Copy one abstract item from a detail-page JSON payload onto *item*.

    Scans ``abstractInfoDTO.abstractItemList`` for the first entry whose
    ``indexCnName`` contains *name* and stores it on *item* under the
    attribute *title* as a :class:`ResultItem`.

    :param json_list: parsed JSON dict returned by the detail page
    :param item: target DataItem to populate
    :param title: attribute name to set on *item*
    :param name: index name (Chinese) to search for
    :return: the (possibly updated) *item*
    """
    # Guard against a malformed payload: the original chained .get() calls
    # raised AttributeError when 'abstractInfoDTO' or the list was missing.
    abstract_dto = json_list.get('abstractInfoDTO') or {}
    for a_item in abstract_dto.get('abstractItemList') or []:
        index_name = a_item.get('indexCnName') or ''
        if name in index_name:  # idiomatic form of .find(name) != -1
            # setattr() is the idiomatic form of item.__setattr__(...)
            setattr(item, title, ResultItem(title=name, value=a_item.get('value')))
            break
    return item
def parse_not_first_page(self, response):
    """Parse a non-first result page (HTML variant) and emit detail requests.

    :param response: page-turning response; ``meta['sipo']`` carries the search context
    :return: generator of follow-up requests built via ``turn_to_request``
    """
    sipo = response.meta['sipo']
    page_soup = BeautifulSoup(response.body_as_unicode(), 'lxml')
    crawler_id = url_page_turning.get('crawler_id')
    for result in page_soup.find_all(attrs={"class": "item"}):
        # Re-parse the prettified fragment once and reuse it below.
        pretty_html = result.prettify()
        result_soup = BeautifulSoup(pretty_html, 'lxml')
        record = DataItem()
        patent_id = result_soup.find(attrs={'name': 'idHidden'}).get('value')
        nrdAn = result_soup.find(attrs={'name': 'nrdAnHidden'}).get('value')
        nrdPn = result_soup.find(attrs={'name': 'nrdPnHidden'}).get('value')
        for crawler in info.crawler_dict.get(crawler_id):
            crawler.parse(pretty_html, record, result_soup)
        yield self.turn_to_request(int(crawler_id),
                                   patent_id=patent_id,
                                   nrdPn=nrdPn,
                                   nrdAn=nrdAn,
                                   sipo=sipo,
                                   data_item=record)
def parse_not_first_page(self, response):
    """Parse a non-first search-result page (JSON variant) and yield detail requests."""
    raw_body = response.body_as_unicode()
    sipo = response.meta['sipo']
    page_result = SearchResultUtil(raw_body)
    # Handle details for every record on this page.
    for raw_record in page_result.get_searchResultRecord_list():
        parsed = SearchResultRecord(raw_record)
        nrdAn = parsed.get_nrdAn()
        nrdPn = parsed.get_nrdPn()
        patent_id = parsed.get_patent_id()
        item = DataItem()
        for crawler in info.crawler_dict.get('0'):
            crawler.parse(raw_body, item, patent_id)
        yield self.turn_to_request(int(url_search.get('crawler_id')),
                                   data_item=item,
                                   nrdAn=nrdAn,
                                   nrdPn=nrdPn,
                                   patent_id=patent_id,
                                   sipo=sipo)
def parse(self, response):
    """Parse the first search-result page (JSON variant).

    Yields one detail request per record on this page, then one FormRequest
    per remaining result page.

    :param response: first-page response; ``meta['sipo']`` carries the search context
    :return: generator of detail requests and page-turning FormRequests
    """
    body = response.body_as_unicode()
    sipo = response.meta['sipo']
    top_page_result = SearchResultUtil(body)
    total_count = top_page_result.get_totalCount()
    if total_count == 0:
        # BUG FIX: page_sum must be defined even with zero hits, otherwise
        # the paging loop below raised NameError in the original code.
        page_sum = 0
        logger.info('共0页')
    else:
        page_sum = int(math.ceil(total_count / top_page_result.get_limit()))
        logger.info('共 %s 页' % page_sum)
    if top_page_result.get_executableSearchExp() is None:
        return
    # Handle details for each record on the first page.
    for record in top_page_result.get_searchResultRecord_list():
        crt_record = SearchResultRecord(record)
        nrdAn = crt_record.get_nrdAn()
        nrdPn = crt_record.get_nrdPn()
        patent_id = crt_record.get_patent_id()
        data_item = DataItem()
        for crawler in info.crawler_dict.get('0'):
            crawler.parse(body, data_item, patent_id)
        yield self.turn_to_request(int(url_search.get('crawler_id')),
                                   data_item=data_item,
                                   nrdAn=nrdAn,
                                   nrdPn=nrdPn,
                                   patent_id=patent_id,
                                   sipo=sipo)
    # Issue page-turning requests for the remaining pages.
    for index in range(1, page_sum):
        # Copy the shared template so successive iterations do not keep
        # mutating the same dict stored in url_page_turning.
        formdata = dict(url_page_turning.get('form_data'))
        formdata['resultPagination.start'] = str(top_page_result.get_limit() * index)
        formdata['resultPagination.totalCount'] = str(total_count)
        formdata['searchCondition.searchExp'] = top_page_result.get_searchExp()
        formdata['searchCondition.executableSearchExp'] = top_page_result.get_executableSearchExp()
        yield FormRequest(
            url=url_page_turning.get('url'),
            callback=self.parse_not_first_page,
            method="POST",
            headers=url_page_turning.get('headers'),
            formdata=formdata,
            meta={
                'sipo': sipo
            }
        )
def push_item(details_str, item: DataItem, title, name):
    """Attach a detail string to *item* as a :class:`ResultItem`.

    :param details_str: parsed detail text to store
    :param item: target DataItem
    :param title: attribute name to set on *item*
    :param name: display title recorded in the ResultItem
    :return: the updated *item*
    """
    # setattr() is the idiomatic form of the direct item.__setattr__ call.
    setattr(item, title, ResultItem(title=name, value=details_str))
    return item
def parse(self, response):
    """Parse the first search-result page (HTML variant).

    Yields one detail request per result item, then one FormRequest per
    remaining result page.

    :param response: first-page response; ``meta['sipo']`` carries the search context
    :return: generator of detail requests and page-turning FormRequests
    """
    body = response.body_as_unicode()
    sipo = response.meta['sipo']
    soup = BeautifulSoup(body, 'lxml')
    # Parse total patent count and page count from the page header.
    page_top = soup.find(attrs={"class": "page_top"})
    if page_top is None:
        # BUG FIX: soup.find returns a Tag or None, never 0, so the original
        # `page_top == 0` test could not fire and a missing header crashed
        # with AttributeError below. No header means no results — stop here.
        logger.info('共0页')
        return
    page_top_line = page_top.get_text(strip=True)
    patent_sum = int(page_top_line[page_top_line[2:].find("页") + 3:page_top_line.find("条")])
    page_sum = int(math.ceil(patent_sum / 12))  # 12 results per page
    logger.info('共 %s 页' % page_sum)
    search_en_div = soup.find(id='result_executableSearchExp')
    if search_en_div is None:
        return
    for item in soup.find_all(attrs={"class": "item"}):
        data_item = DataItem()
        # Re-parse the prettified fragment once and reuse it below.
        item_html = item.prettify()
        item_soup = BeautifulSoup(item_html, 'lxml')
        for crawler in info.crawler_dict.get('0'):
            crawler.parse(item_html, data_item, item_soup)
        patent_id = item_soup.find(attrs={'name': 'idHidden'}).get('value')
        nrdAn = item_soup.find(attrs={'name': 'nrdAnHidden'}).get('value')
        nrdPn = item_soup.find(attrs={'name': 'nrdPnHidden'}).get('value')
        yield self.turn_to_request(int(url_search.get('crawler_id')),
                                   data_item=data_item,
                                   nrdAn=nrdAn,
                                   nrdPn=nrdPn,
                                   patent_id=patent_id,
                                   sipo=sipo)
    for index in range(1, page_sum):
        # Copy the shared template so successive iterations do not keep
        # mutating the same dict stored in url_page_turning.
        formdata = dict(url_page_turning.get('form_data'))
        formdata['resultPagination.start'] = str(12 * index)
        formdata['resultPagination.totalCount'] = str(patent_sum)
        formdata['searchCondition.searchExp'] = sipo.search_exp_cn
        formdata['searchCondition.executableSearchExp'] = search_en_div.get_text()
        yield FormRequest(url=url_page_turning.get('url'),
                          callback=self.parse_not_first_page,
                          method="POST",
                          headers=url_page_turning.get('headers'),
                          formdata=formdata,
                          meta={'sipo': sipo})