def parse_full_text(self, response):
    """Parse the JSON full-text response; fill claims/description and finish the item.

    On any parse/JSON error the item is yielded as-is (best effort).
    """
    item = response.meta['item']
    doc = item['doc']
    try:
        payload = json.loads(response.body_as_unicode())
        # A truthy 'ErrorInfo' — or a *missing* key, via the truthy default —
        # marks this payload as an error response.
        if payload.get('ErrorInfo', True):
            raise Exception('response json error')
        patent = payload['Option']['PatentList'][0]
    except Exception as e:
        log.msg('parse_full_text error %s' % e)
        yield self.item_or_request(item)
        return

    doc['claims'] = patent.get('CLM', '')
    doc['description'] = patent.get('DESCR', '')

    # Archive the raw JSON body as a synthetic attachment.
    item['attachments'].append({
        'url': 'http://www.innojoy.com/not_exist/%s_2' % doc.get('dn', ''),
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response),
    })

    # Drop crawl-internal bookkeeping fields before cleaning.
    doc.pop('dic', None)
    doc.pop('dn', None)
    for field, value in doc.items():
        doc[field] = common_clean(value, junks=["<font color='red'>", '</font>'])
    yield self.item_or_request(item)
def parse_detail_page(self, response):
    """Extract abstract/agent/claims from the detail page and queue the PDF request."""
    item = response.meta['item']
    selector = HtmlXPathSelector(response_html5parse(response))

    doc = item['doc']
    doc['abstract'] = ''.join(
        selector.select('//span[@id="txtAbstr"]//text()').extract())
    doc['agent_institution'] = ''.join(
        selector.select('//span[@id="tdANM"]//text()').extract())
    doc['claims'] = ''.join(
        selector.select('//span[@id="txtClaim"]//text()').extract())

    # Build the full-text PDF URL from the application-number key kept in
    # the raw record dict, and chain the download as the next request.
    dic = doc['dic']
    pno = 'APP%s' % dic['StrANX']
    pdf_url = 'http://searchtel.patentstar.com.cn/CPRS2010/Docdb/GetBns.aspx?PNo=%s' % pno
    item['next_request'] = Request(pdf_url, callback=self.parse_pdf)

    # Archive the raw HTML page as an attachment.
    item['attachments'].append({
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response),
    })
    yield self.item_or_request(item)
def parse_detail_page(self, response):
    """Parse a patent detail page: blur-analyze the info tables, set name/abstract,
    normalize the application number, and archive the page + images.
    """
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    texts1 = hxs.select('//table[@class="tb"]//td//text()').extract()
    texts2 = hxs.select('//div[@class="t2"]//text()').extract()
    result_doc1 = blur_ana_patent(texts1)
    result_doc2 = blur_ana_patent(texts2)
    patent_name = ''.join(hxs.select('//div[@class="t1"]//text()').extract())
    abstract = ''.join(hxs.select('//div[@class="con2"]//text()').extract())
    doc = item['doc']
    doc.update(result_doc1)
    doc.update(result_doc2)
    doc['patent_name'] = patent_name
    doc['abstract'] = abstract
    # FIX: the original used str.lstrip('/专利号: '), but lstrip treats its
    # argument as a *character set*, not a prefix — it strips any leading run
    # of those characters and can eat legitimate leading characters of the
    # number itself. Strip the known label prefix explicitly instead.
    app_no = doc['application_number'].lstrip('/ ')
    if app_no.startswith('专利号'):
        app_no = app_no[len('专利号'):]
    doc['application_number'] = app_no.lstrip(': ')
    attachments = item['attachments']
    # Archive the raw HTML page as an attachment.
    attach1 = {
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response),
    }
    attachments.append(attach1)
    image_urls = get_image_urls(response)
    item['attachment_urls'] += image_urls
    yield self.item_or_request(item)
def parse_detail_page(self, response):
    """Parse a Wanfang patent detail page into the item's doc and archive it."""
    item = response.meta["item"]
    selector = HtmlXPathSelector(response_html5parse(response))

    raw_texts = selector.select('//table[@id="perildical2_dl"]//text()').extract()
    cleaned = clean_wanfang_texts(raw_texts)

    doc = item["doc"]
    doc.update(blur_ana_patent(cleaned))
    doc["abstract"] = "".join(
        selector.select('//div[@class="abstracts"]//text()').extract())
    # hotfix: patent_type comes from the row whose header cell reads 专利类型
    doc["patent_type"] = "".join(
        selector.select('//th[contains(.//text(),"专利类型")]/../td//text()').extract())

    # Archive the raw HTML page plus any page images.
    item["attachments"].append({
        "url": response.url,
        "data": response.body_as_unicode(),
        "mime_type": get_mime_type_in_response(response),
    })
    item["attachment_urls"] += get_image_urls(response)
    yield self.item_or_request(item)
def media_downloaded(self, response, request, info):
    """Persist a downloaded media response as an attachment record on the item."""
    item = response.meta['item']
    # Text responses are stored as unicode; binary bodies are wrapped
    # so the storage layer accepts raw bytes.
    if isinstance(response, TextResponse):
        payload = response.body_as_unicode()
    else:
        payload = Binary(response.body)
    attachment = {
        'url': request.url,
        'data': payload,
        'mime_type': get_mime_type_in_response(response),
    }
    # Save immediately and keep only the storage id on the item.
    saved_id = info.spider.db_adapter.save_attachment(attachment)
    item['attachments'].append(saved_id)
    return response
def parse_detail_page(self, response):
    """Parse a Soopat patent detail page: blur-analyze three text regions,
    set name/abstract, and archive the page plus its images.
    """
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    texts1 = hxs.select('//span[@class="detailtitle"]//text()').extract()
    texts2 = hxs.select('//table[@class="datainfo"]//text()').extract()
    texts3 = hxs.select('//table[@id="PatentContentTable"]//text()').extract()
    texts1 = clean_soopat_texts(texts1)
    texts2 = clean_soopat_texts(texts2)
    texts3 = clean_soopat_texts(texts3)
    result_doc1 = blur_ana_patent(texts1)
    result_doc2 = blur_ana_patent(texts2)
    result_doc3 = blur_ana_patent(texts3)
    patent_name = ''.join(hxs.select('//span[@class="detailtitle"]/h1//text()').extract())
    abstract = ''.join(hxs.select('//td[@class="sum f14"]//text()').extract())
    doc = item['doc']
    doc.update(result_doc1)
    doc.update(result_doc2)
    doc.update(result_doc3)
    doc['patent_name'] = patent_name
    doc['abstract'] = abstract
    attachments = item['attachments']
    # Archive the raw HTML page as an attachment.
    attach1 = {
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response),
    }
    attachments.append(attach1)
    image_urls = get_image_urls(response)
    # FIX: when no jqzoom anchor exists, ''.join(...) yields '' and the
    # original appended that empty string, queuing a bogus empty
    # attachment URL. Only append a non-empty url.
    img_url = ''.join(hxs.select('//a[@class="jqzoom"]/@href').extract())
    if img_url:
        image_urls.append(img_url)
    item['attachment_urls'] += image_urls
    yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse the innojoy JSON search-result payload; yield one PatentItem per patent.

    Records the total result count for the query, or a zero limit when the
    payload reports an error.
    """
    query = response.meta['query']
    # FIX: the original left `patent_list` unbound on the ErrorInfo branch
    # (the try-block completed normally), so the loop below raised an
    # uncaught NameError. Default to an empty list instead.
    patent_list = []
    try:
        result = json.loads(response.body_as_unicode())
        # A truthy 'ErrorInfo' — or a missing key, via the truthy default —
        # marks this payload as an error response.
        if result.get('ErrorInfo', True):
            self._query_item_limit[query] = 0
        else:
            self._query_item_limit[query] = result['Option']['Count']
            patent_list = result['Option']['PatentList']
    except Exception as e:
        log.msg('parse_list_page error:%s' % e)
        return
    # Loop-invariant: same list URL for every yielded item.
    list_url = response.url
    for patent in patent_list:
        dn = patent.get('DN', '')
        doc = {
            'data_source': 'innojoy专利搜索',
            'url': 'http://www.innojoy.com/not_exist/%s_0' % dn,
            'applicant_address': patent.get('AR', ''),
            'abstract': patent.get('ABST', ''),
            'agent_institution': patent.get('AGC', ''),
            'agent_person': patent.get('AGT', ''),
            'application_time': patent.get('AD', ''),
            'application_number': patent.get('AN', ''),
            'applicant': patent.get('PA', ''),
            'claims': patent.get('CL', ''),
            'inventor': patent.get('INN', ''),
            'classification': patent.get('PIC', ''),
            'publication_number': patent.get('PNM', ''),
            'publication_time': patent.get('PD', ''),
            'patent_name': patent.get('TI', ''),
            'patent_state': patent.get('LLS', ''),
            'patent_type': patent.get('DBName', ''),
            # Crawl-internal fields, removed later in parse_full_text.
            'dn': dn,
            'dic': patent,
        }
        image_urls = []
        mp = patent.get('MP', '')
        if mp:
            image_urls.append(mp)
        # Only schedule the full-text request when both keys are present.
        db = patent.get('DB', '')
        if dn and db:
            next_request = self._construct_full_text(dn=dn, db=db)
        else:
            next_request = None
        item = PatentItem(doc=doc, next_request=next_request,
                          list_url=list_url, query=query,
                          attachments=[], attachment_urls=[])
        item['attachment_urls'] += image_urls
        # Archive the raw JSON list page as a synthetic attachment.
        attach1 = {
            'url': 'http://www.innojoy.com/not_exist/%s_1' % doc.get('dn', ''),
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response),
        }
        item['attachments'].append(attach1)
        yield self.item_or_request(item)
def parse_detail_page(self, response):
    """Parse a Google Patents detail page; if a Chinese version is linked,
    yield a follow-up item to crawl it too.
    """
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    row_selectors = hxs.select('//table[contains(@class,"patent-bibdata")]//tr')
    # FIX: the original comprehension reused the name `hxs` for the row
    # selector ("for hxs in texts_hxs"); under Python 2 scoping the
    # comprehension variable leaks, shadowing the page-level selector for
    # every query below. Use a distinct name.
    texts = [''.join(row.select('.//text()').extract()) for row in row_selectors]
    # texts = clean_google_texts(texts)
    result_doc = blur_ana_patent(texts)
    classification_hxs = hxs.select('//td[text()="國際專利分類號"]/parent::* | '
                                    '//td[text()="国际分类号"]/parent::* |'
                                    '//td[text()="International Classification"]/parent::*')
    patent_state = ''.join(hxs.select('//td[text()="出版類型"]/../td[2]//text()').extract())
    texts1 = [''.join(classification_hxs.select('.//text()').extract())]
    result_doc1 = blur_ana_patent(texts1)
    doc = item['doc']
    doc.update(result_doc)
    doc.update(result_doc1)
    patent_name = ''.join(hxs.select('//span[@class="patent-title"]//text()').extract())
    abstract = ''.join(
        hxs.select('//div[@class="patent-section patent-abstract-section"]//div[@class="patent-text"]//text()')
        .extract())
    description = ''.join(
        hxs.select('//div[@class="patent-section patent-description-section"]//div[@class="patent-text"]//text()')
        .extract())
    claims = ''.join(
        hxs.select('//div[@class="patent-section patent-claims-section"]//div[@class="patent-text"]//text()')
        .extract())
    doc['patent_name'] = patent_name
    doc['abstract'] = abstract
    doc['description'] = description
    doc['claims'] = claims
    doc['patent_state'] = patent_state
    doc['patent_type'] = ''
    attachments = item['attachments']
    # Archive the raw HTML page as an attachment.
    attach1 = {
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response),
    }
    attachments.append(attach1)
    image_urls = get_image_urls(response)
    item['attachment_urls'] += image_urls
    yield self.item_or_request(item)
    # If a Chinese version exists, crawl the Chinese version too.
    link_ex = LxmlParserLinkExtractor(unique=False)
    links = link_ex.extract_links(response)
    for link in links:
        if link.text in ['Chinese', 'chinese', '中文']:
            cn_request = Request(link.url, callback=self.parse_detail_page)
            cn_doc = {
                'data_source': 'google专利搜索',
                'url': link.url,
            }
            cn_item = PatentItem(doc=cn_doc, next_request=cn_request,
                                 list_url=item['list_url'], query=item['query'],
                                 attachments=[], attachment_urls=[])
            yield self.item_or_request(cn_item)
            break