def get_image_urls(response):
    """Return the URLs of the ``src`` links on ``<img>`` tags in *response*.

    The lxml-based link extractor is used deliberately: according to the
    original author's note, the SGML extractor did not seem to work and
    the HtmlParser-based extractor was slower than lxml.
    """
    extractor = LxmlParserLinkExtractor(tag='img', attr='src', unique=True)
    return [image_link.url for image_link in extractor.extract_links(response)]
def parse_detail_page(self, response):
    """Parse a patent detail page and yield the resulting item/request(s).

    The item being filled is taken from ``response.meta['item']``. The
    method:

    1. re-parses the response with ``response_html5parse`` and extracts
       the bibliographic table rows and the classification row, feeding
       their text to ``blur_ana_patent`` and merging both results into
       ``item['doc']``;
    2. stores title, abstract, description, claims and patent state in
       ``item['doc']`` (``patent_type`` is set to an empty string);
    3. appends the page itself to ``item['attachments']`` and the page's
       image URLs to ``item['attachment_urls']``;
    4. yields ``self.item_or_request(item)``;
    5. for the first link whose text is one of the Chinese-version labels,
       yields ``self.item_or_request(...)`` for a new ``PatentItem`` whose
       ``next_request`` calls back into this method.
    """
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)

    def section_text(section):
        # Joined text of one "patent-<section>-section" block of the page.
        return ''.join(
            hxs.select('//div[@class="patent-section patent-%s-section"]'
                       '//div[@class="patent-text"]//text()' % section)
            .extract())

    rows = hxs.select('//table[contains(@class,"patent-bibdata")]//tr')
    # Use a dedicated loop variable: reusing ``hxs`` here would rebind the
    # document selector under Python 2, where list-comprehension variables
    # leak into the enclosing scope.
    texts = [''.join(row.select('.//text()').extract()) for row in rows]
    # texts = clean_google_texts(texts)
    result_doc = blur_ana_patent(texts)

    # The classification row label differs per page language
    # (Traditional Chinese, Simplified Chinese, English).
    classification_hxs = hxs.select('//td[text()="國際專利分類號"]/parent::* | '
                                    '//td[text()="国际分类号"]/parent::* |'
                                    '//td[text()="International Classification"]/parent::*')
    patent_state = ''.join(
        hxs.select('//td[text()="出版類型"]/../td[2]//text()').extract())
    texts1 = [''.join(classification_hxs.select('.//text()').extract())]
    result_doc1 = blur_ana_patent(texts1)

    doc = item['doc']
    doc.update(result_doc)
    doc.update(result_doc1)

    doc['patent_name'] = ''.join(
        hxs.select('//span[@class="patent-title"]//text()').extract())
    doc['abstract'] = section_text('abstract')
    doc['description'] = section_text('description')
    doc['claims'] = section_text('claims')
    doc['patent_state'] = patent_state
    doc['patent_type'] = ''

    # Keep the fetched page itself as an attachment of the item.
    item['attachments'].append({
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response),
    })

    image_urls = get_image_urls(response)
    item['attachment_urls'] += image_urls

    yield self.item_or_request(item)

    # If a Chinese version of the page exists, crawl it as well.
    link_ex = LxmlParserLinkExtractor(unique=False)
    for link in link_ex.extract_links(response):
        if link.text in ['Chinese', 'chinese', '中文']:
            request = Request(link.url, callback=self.parse_detail_page)
            cn_doc = {
                'data_source': 'google专利搜索',
                'url': link.url,
            }
            cn_item = PatentItem(doc=cn_doc,
                                 next_request=request,
                                 list_url=item['list_url'],
                                 query=item['query'],
                                 attachments=[],
                                 attachment_urls=[])
            yield self.item_or_request(cn_item)
            # Only the first matching link is followed.
            break