Example #1
    def parse_full_text(self, response):
        item = response.meta['item']
        doc = item['doc']
        try:
            json_dic = json.loads(response.body_as_unicode())
            if json_dic.get('ErrorInfo', True):  # default True: a missing 'ErrorInfo' key also counts as an error
                raise Exception('response json error')
            patent = json_dic['Option']['PatentList'][0]
        except Exception as e:
            log.msg('parse_full_text error: %s' % e)
            yield self.item_or_request(item)
            return
        doc['claims'] = patent.get('CLM', '')
        doc['description'] = patent.get('DESCR', '')
        attachments = item['attachments']
        attach1 = {
            'url': 'http://www.innojoy.com/not_exist/%s_2' % doc.get('dn', ''),
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        }
        attachments.append(attach1)
        doc.pop('dic', None)
        doc.pop('dn', None)
        for k, v in doc.items():
            doc[k] = common_clean(v, junks=["<font color='red'>", '</font>'])

        yield self.item_or_request(item)
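Each of these callbacks ends with `yield self.item_or_request(item)`, a helper that is never shown in the examples. Judging from how `next_request` is populated in Example #2 and consumed again here, a minimal sketch of what it presumably does (the exact field handling is an assumption):

    def item_or_request(self, item):
        # hypothetical sketch: if the item still has a follow-up request queued,
        # hand the item to it and keep crawling; otherwise the item is complete
        next_request = item.get('next_request')
        if next_request is None:
            return item
        item['next_request'] = None          # consume the pending request
        next_request.meta['item'] = item     # pass the item to the next callback
        return next_request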
Example #2
    def parse_detail_page(self, response):
        item = response.meta['item']

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        abstract = ''.join(hxs.select('//span[@id="txtAbstr"]//text()').extract())
        agent_institution = ''.join(hxs.select('//span[@id="tdANM"]//text()').extract())
        claims = ''.join(hxs.select('//span[@id="txtClaim"]//text()').extract())

        doc = item['doc']
        doc['abstract'] = abstract
        doc['agent_institution'] = agent_institution
        doc['claims'] = claims

        dic = doc['dic']
        pno = 'APP%s' % dic['StrANX']
        pdf_url = 'http://searchtel.patentstar.com.cn/CPRS2010/Docdb/GetBns.aspx?PNo=%s' % pno
        next_request = Request(pdf_url, callback=self.parse_pdf)
        item['next_request'] = next_request

        attachments = item['attachments']
        attach1 = {
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        }
        attachments.append(attach1)
        yield self.item_or_request(item)
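`get_mime_type_in_response` appears in every attachment dict above. It presumably just reads the media type from the response headers; a minimal sketch under that assumption:

    def get_mime_type_in_response(response):
        # hypothetical sketch: take the media type from the Content-Type header,
        # dropping any charset parameter ('text/html; charset=utf-8' -> 'text/html')
        content_type = response.headers.get('Content-Type', '')
        return content_type.split(';')[0].strip()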
Example #3
    def parse_detail_page(self, response):
        item = response.meta['item']

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        texts1 = hxs.select('//table[@class="tb"]//td//text()').extract()
        texts2 = hxs.select('//div[@class="t2"]//text()').extract()
        result_doc1 = blur_ana_patent(texts1)
        result_doc2 = blur_ana_patent(texts2)
        patent_name = ''.join(hxs.select('//div[@class="t1"]//text()').extract())
        abstract = ''.join(hxs.select('//div[@class="con2"]//text()').extract())

        doc = item['doc']
        doc.update(result_doc1)
        doc.update(result_doc2)
        doc['patent_name'] = patent_name
        doc['abstract'] = abstract
        # note: lstrip() strips a set of characters, not a literal prefix;
        # it works here only because the value itself contains none of them
        doc['application_number'] = doc['application_number'].lstrip('/专利号: ')
        attachments = item['attachments']
        attach1 = {
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        }
        attachments.append(attach1)
        image_urls = get_image_urls(response)
        item['attachment_urls'] += image_urls
        yield self.item_or_request(item)
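`get_image_urls` feeds `item['attachment_urls']` in several examples. A plausible sketch, assuming it collects every `<img src>` on the page and resolves it against the page URL (this is Python 2-era Scrapy, hence `urlparse` and `HtmlXPathSelector`):

    import urlparse

    def get_image_urls(response):
        # hypothetical sketch: gather all <img src> attributes and make them
        # absolute so the attachment downloader receives usable links
        hxs = HtmlXPathSelector(response)
        srcs = hxs.select('//img/@src').extract()
        return [urlparse.urljoin(response.url, src) for src in srcs]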
Example #4
    def parse_detail_page(self, response):
        item = response.meta["item"]

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        texts = hxs.select('//table[@id="perildical2_dl"]//text()').extract()
        texts = clean_wanfang_texts(texts)
        result_doc = blur_ana_patent(texts)

        abstract = "".join(hxs.select('//div[@class="abstracts"]//text()').extract())

        doc = item["doc"]
        doc.update(result_doc)
        doc["abstract"] = abstract
        attachments = item["attachments"]
        attach1 = {
            "url": response.url,
            "data": response.body_as_unicode(),
            "mime_type": get_mime_type_in_response(response),
        }
        attachments.append(attach1)
        image_urls = get_image_urls(response)
        item["attachment_urls"] += image_urls
        # more_url = response.url.replace('_free', '')
        # next_request = Request(more_url, callback=self.parse_more_page)
        # item['next_request'] = next_request

        # hotfix for patent_type
        patent_type = "".join(hxs.select('//th[contains(.//text(),"专利类型")]/../td//text()').extract())
        doc["patent_type"] = patent_type

        yield self.item_or_request(item)
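`blur_ana_patent` is the workhorse shared by Examples #3, #4, #6 and #8: it takes a flat list of text fragments scraped from a detail table and returns a partial doc dict. Its implementation is not shown; one plausible reading is a fuzzy "label: value" matcher over a table of known Chinese field labels (the label table below is illustrative, not the real one):

    FIELD_LABELS = {
        u'申请号': 'application_number',
        u'申请日': 'application_time',
        u'公开号': 'publication_number',
        u'专利类型': 'patent_type',
        # ... one entry per recognised label
    }

    def blur_ana_patent(texts):
        # hypothetical sketch: scan "label: value" fragments, normalise the
        # fullwidth colon, and map recognised labels onto canonical doc keys
        doc = {}
        for text in texts:
            label, sep, value = text.replace(u'：', u':').partition(u':')
            if sep and label.strip() in FIELD_LABELS:
                doc[FIELD_LABELS[label.strip()]] = value.strip()
        return doc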
Example #5
    def media_downloaded(self, response, request, info):
        item = response.meta['item']
        # text responses are stored as unicode; binary payloads are wrapped for BSON storage
        if isinstance(response, TextResponse):
            data = response.body_as_unicode()
        else:
            data = Binary(response.body)
        attach = {
            'url': request.url,
            'data': data,
            'mime_type': get_mime_type_in_response(response),
        }
        attach_id = info.spider.db_adapter.save_attachment(attach)
        item['attachments'].append(attach_id)
        return response
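This method overrides the per-download hook of Scrapy's media pipeline, so every fetched attachment is persisted as soon as it arrives; note that here `item['attachments']` collects database ids, whereas the spider callbacks above append attachment dicts. `Binary` is presumably `bson.binary.Binary`, which fits the `db_adapter.save_attachment` call. A sketch of how such a pipeline might be wired, with the class name and request scheduling being assumptions:

    from bson.binary import Binary
    from scrapy.http import Request, TextResponse
    from scrapy.contrib.pipeline.media import MediaPipeline

    class AttachmentPipeline(MediaPipeline):
        # hypothetical wiring: schedule one download per collected attachment URL
        # and carry the item along so media_downloaded (above) can store the result
        def get_media_requests(self, item, info):
            for url in item['attachment_urls']:
                request = Request(url)
                request.meta['item'] = item
                yield request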
Example #6
    def parse_detail_page(self, response):
        item = response.meta['item']

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        texts1 = hxs.select('//span[@class="detailtitle"]//text()').extract()
        texts2 = hxs.select('//table[@class="datainfo"]//text()').extract()
        texts3 = hxs.select('//table[@id="PatentContentTable"]//text()').extract()
        texts1 = clean_soopat_texts(texts1)
        texts2 = clean_soopat_texts(texts2)
        texts3 = clean_soopat_texts(texts3)
        result_doc1 = blur_ana_patent(texts1)
        result_doc2 = blur_ana_patent(texts2)
        result_doc3 = blur_ana_patent(texts3)
        patent_name = ''.join(hxs.select('//span[@class="detailtitle"]/h1//text()').extract())
        abstract = ''.join(hxs.select('//td[@class="sum f14"]//text()').extract())

        doc = item['doc']
        doc.update(result_doc1)
        doc.update(result_doc2)
        doc.update(result_doc3)
        doc['patent_name'] = patent_name
        doc['abstract'] = abstract
        attachments = item['attachments']
        attach1 = {
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        }
        attachments.append(attach1)
        image_urls = get_image_urls(response)
        img_url = ''.join(hxs.select('//a[@class="jqzoom"]/@href').extract())
        if img_url:  # skip the append when no zoom link is present
            image_urls.append(img_url)
        item['attachment_urls'] += image_urls
        yield self.item_or_request(item)
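`clean_soopat_texts` (like `clean_wanfang_texts` in Example #4) presumably normalises the raw text fragments before the fuzzy analysis; a minimal sketch under that assumption:

    def clean_soopat_texts(texts):
        # hypothetical sketch: strip surrounding whitespace and drop empty
        # fragments so blur_ana_patent sees one clean string per field
        cleaned = []
        for text in texts:
            text = text.strip()
            if text:
                cleaned.append(text)
        return cleaned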
Example #7
    def parse_list_page(self, response):
        query = response.meta['query']

        try:
            result = json.loads(response.body_as_unicode())
            if result.get('ErrorInfo', True):  # default True: a missing 'ErrorInfo' key also counts as an error
                self._query_item_limit[query] = 0
            else:
                query_item_limit = result['Option']['Count']
                self._query_item_limit[query] = query_item_limit
            patent_list = result['Option']['PatentList']
        except Exception as e:
            log.msg('parse_list_page error: %s' % e)
            return
        for patent in patent_list:
            # url = 'http://search.innojoy.com.cn/CPRS2010/cn/PatentDetails.html?cic=%s&id=%s&idk=%s' % (cic,id,idk)
            # url = ''
            #db = patent.get('DB','')
            #db_name = patent.get('DBName','')

            dn = patent.get('DN', '')
            doc = {
                'data_source': 'innojoy专利搜索',
                'url': 'http://www.innojoy.com/not_exist/%s_0' % dn,
                'applicant_address': patent.get('AR', ''),
                'abstract': patent.get('ABST', ''),
                # '':dic.get('StrAgency',''),
                'agent_institution': patent.get('AGC', ''),
                'agent_person': patent.get('AGT', ''),
                'application_time': patent.get('AD', ''),
                'application_number': patent.get('AN', ''),
                'applicant': patent.get('PA', ''),
                'claims': patent.get('CL', ''),
                'inventor': patent.get('INN', ''),
                'classification': patent.get('PIC', ''),
                'publication_number': patent.get('PNM', ''),
                'publication_time': patent.get('PD', ''),
                'patent_name': patent.get('TI', ''),
                'patent_state': patent.get('LLS', ''),
                'patent_type': patent.get('DBName', ''),
                #---
                'dn': dn,
                'dic': patent,
            }
            image_urls = []
            mp = patent.get('MP', '')
            if mp:
                image_urls.append(mp)

            list_url = response.url

            db = patent.get('DB', '')
            if not dn or not db:
                next_request = None
            else:
                next_request = self._construct_full_text(dn=dn, db=db)
            item = PatentItem(doc=doc,
                              next_request=next_request, list_url=list_url, query=query,
                              attachments=[], attachment_urls=[])
            item['attachment_urls'] += image_urls
            attachments = item['attachments']
            attach1 = {
                'url': 'http://www.innojoy.com/not_exist/%s_1' % dn,
                'data': response.body_as_unicode(),
                'mime_type': get_mime_type_in_response(response)
            }
            attachments.append(attach1)
            yield self.item_or_request(item)
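`_construct_full_text` builds the follow-up request whose JSON answer is handled by `parse_full_text` in Example #1. Only the `dn`/`db` inputs and the callback are visible in these examples; the endpoint and parameter names below are pure placeholders:

    def _construct_full_text(self, dn, db):
        # hypothetical sketch: the URL and query parameters are placeholders;
        # only dn, db and the parse_full_text callback are known from the examples
        url = 'http://www.innojoy.com/full_text_endpoint?dn=%s&db=%s' % (dn, db)
        return Request(url, callback=self.parse_full_text)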
Example #8
    def parse_detail_page(self, response):
        item = response.meta['item']

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        texts_hxs = hxs.select('//table[contains(@class,"patent-bibdata")]//tr')
        # use a distinct loop variable: in Python 2 the comprehension variable
        # leaks into the enclosing scope and would otherwise shadow `hxs`
        texts = [''.join(row.select('.//text()').extract()) for row in texts_hxs]
        # texts = clean_google_texts(texts)
        result_doc = blur_ana_patent(texts)
        classification_hxs = hxs.select('//td[text()="國際專利分類號"]/parent::* | '
                             '//td[text()="国际分类号"]/parent::* |'
                             '//td[text()="International Classification"]/parent::*')
        patent_state = ''.join(hxs.select('//td[text()="出版類型"]/../td[2]//text()').extract())

        texts1 = [''.join(classification_hxs.select('.//text()').extract())]
        result_doc1 = blur_ana_patent(texts1)
        doc = item['doc']
        doc.update(result_doc)
        doc.update(result_doc1)

        patent_name = ''.join(hxs.select('//span[@class="patent-title"]//text()').extract())
        abstract = ''.join(
            hxs.select('//div[@class="patent-section patent-abstract-section"]//div[@class="patent-text"]//text()')
            .extract())
        description = ''.join(
            hxs.select('//div[@class="patent-section patent-description-section"]//div[@class="patent-text"]//text()')
            .extract())
        claims = ''.join(
            hxs.select('//div[@class="patent-section patent-claims-section"]//div[@class="patent-text"]//text()')
            .extract())

        doc['patent_name'] = patent_name
        doc['abstract'] = abstract
        doc['description'] = description
        doc['claims'] = claims
        doc['patent_state'] = patent_state
        doc['patent_type'] = ''

        attachments = item['attachments']
        attach1 = {
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        }
        attachments.append(attach1)
        image_urls = get_image_urls(response)
        item['attachment_urls'] += image_urls
        yield self.item_or_request(item)
        # if a Chinese-language version exists, crawl it as well
        link_ex = LxmlParserLinkExtractor(unique=False)
        links = link_ex.extract_links(response)
        for link in links:
            if link.text in ['Chinese', 'chinese', '中文']:
                request = Request(link.url, callback=self.parse_detail_page)
                doc = {
                    'data_source': 'google专利搜索',
                    'url': link.url,
                }
                cn_item = PatentItem(doc=doc,
                                     next_request=request, list_url=item['list_url'], query=item['query'],
                                     attachments=[], attachment_urls=[])
                yield self.item_or_request(cn_item)
                break
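Finally, `response_html5parse` opens every detail parser above: it reruns the body through an HTML5 parser so the XPath selectors see repaired markup. A sketch of such a helper, assuming html5lib with the lxml treebuilder:

    import html5lib
    from lxml import etree

    def response_html5parse(response):
        # hypothetical sketch: re-serialise the body through html5lib so that
        # HtmlXPathSelector works on well-formed markup
        tree = html5lib.parse(response.body_as_unicode(), treebuilder='lxml',
                              namespaceHTMLElements=False)
        body = etree.tostring(tree, encoding='utf-8')
        return response.replace(body=body)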