Ejemplo n.º 1
0
    def parse_detail_page(self, response):
        """Fill the item carried in ``response.meta`` from a patent detail page.

        The fields that ``blur_ana_patent`` derives from the ``tb`` table cells
        and from the ``t2`` div are merged into ``item['doc']``; the title and
        abstract are stored alongside them. The page itself is recorded as an
        attachment and the page's image URLs are added to
        ``item['attachment_urls']``.

        Yields:
            The result of ``self.item_or_request(item)``.
        """
        item = response.meta['item']
        selector = HtmlXPathSelector(response_html5parse(response))

        def text_nodes(xpath):
            # All text nodes matched by ``xpath`` on the re-parsed page.
            return selector.select(xpath).extract()

        cell_texts = text_nodes('//table[@class="tb"]//td//text()')
        block_texts = text_nodes('//div[@class="t2"]//text()')
        cell_fields = blur_ana_patent(cell_texts)
        block_fields = blur_ana_patent(block_texts)
        title = ''.join(text_nodes('//div[@class="t1"]//text()'))
        summary = ''.join(text_nodes('//div[@class="con2"]//text()'))

        record = item['doc']
        # Later updates win on key clashes: the "t2" fields override the table fields.
        for fields in (cell_fields, block_fields):
            record.update(fields)
        record['patent_name'] = title
        record['abstract'] = summary
        # lstrip() treats its argument as a set of characters: any leading run
        # made of those characters is removed, not just the exact prefix.
        record['application_number'] = record['application_number'].lstrip('/专利号: ')

        # Keep the detail page itself as the first attachment of this step.
        item['attachments'].append({
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        })

        page_images = get_image_urls(response)
        item['attachment_urls'] += page_images
        yield self.item_or_request(item)
Ejemplo n.º 2
0
    def parse_detail_page(self, response):
        """Fill the item carried in ``response.meta`` from a patent detail page.

        Text from the ``perildical2_dl`` table is passed through
        ``clean_wanfang_texts`` and ``blur_ana_patent``; the resulting fields
        are merged into ``item["doc"]`` together with the abstract and the
        patent type. The page itself is recorded as an attachment and the
        page's image URLs are added to ``item["attachment_urls"]``.

        Yields:
            The result of ``self.item_or_request(item)``.
        """
        item = response.meta["item"]
        selector = HtmlXPathSelector(response_html5parse(response))

        raw_texts = selector.select('//table[@id="perildical2_dl"]//text()').extract()
        parsed_fields = blur_ana_patent(clean_wanfang_texts(raw_texts))
        summary = "".join(selector.select('//div[@class="abstracts"]//text()').extract())

        record = item["doc"]
        record.update(parsed_fields)
        record["abstract"] = summary

        # Keep the detail page itself as an attachment.
        item["attachments"].append({
            "url": response.url,
            "data": response.body_as_unicode(),
            "mime_type": get_mime_type_in_response(response),
        })

        page_images = get_image_urls(response)
        item["attachment_urls"] += page_images
        # NOTE: a follow-up request to the URL with '_free' removed (callback
        # parse_more_page, stored in item['next_request']) used to be built
        # here and is currently disabled.

        # Hotfix for patent_type: read it from the cells of the row whose
        # header contains the patent-type label, overriding any parsed value.
        type_nodes = selector.select('//th[contains(.//text(),"专利类型")]/../td//text()')
        record["patent_type"] = "".join(type_nodes.extract())

        yield self.item_or_request(item)
Ejemplo n.º 3
0
    def parse_detail_page(self, response):
        """Fill the item carried in ``response.meta`` from a patent detail page.

        Text from the ``detailtitle`` span, the ``datainfo`` table and the
        ``PatentContentTable`` table is cleaned with ``clean_soopat_texts`` and
        analysed with ``blur_ana_patent``; the resulting fields are merged, in
        that order, into ``item['doc']`` together with the title and abstract.
        The page itself is recorded as an attachment, and the page's image URLs
        plus the zoom-image link(s) are added to ``item['attachment_urls']``.

        Yields:
            The result of ``self.item_or_request(item)``.
        """
        item = response.meta['item']

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        texts1 = hxs.select('//span[@class="detailtitle"]//text()').extract()
        texts2 = hxs.select('//table[@class="datainfo"]//text()').extract()
        texts3 = hxs.select('//table[@id="PatentContentTable"]//text()').extract()
        texts1 = clean_soopat_texts(texts1)
        texts2 = clean_soopat_texts(texts2)
        texts3 = clean_soopat_texts(texts3)
        result_doc1 = blur_ana_patent(texts1)
        result_doc2 = blur_ana_patent(texts2)
        result_doc3 = blur_ana_patent(texts3)
        patent_name = ''.join(hxs.select('//span[@class="detailtitle"]/h1//text()').extract())
        abstract = ''.join(hxs.select('//td[@class="sum f14"]//text()').extract())

        doc = item['doc']
        # Later updates win on key clashes, so the content table has the last word.
        doc.update(result_doc1)
        doc.update(result_doc2)
        doc.update(result_doc3)
        doc['patent_name'] = patent_name
        doc['abstract'] = abstract

        # Keep the detail page itself as an attachment.
        attachments = item['attachments']
        attachments.append({
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        })

        image_urls = get_image_urls(response)
        # Add each zoom-image link on its own. Joining the hrefs into a single
        # string would queue an empty URL when the link is missing and a
        # concatenated, invalid URL when more than one link matches.
        zoom_hrefs = hxs.select('//a[@class="jqzoom"]/@href').extract()
        image_urls.extend(href for href in zoom_hrefs if href)
        item['attachment_urls'] += image_urls
        yield self.item_or_request(item)
Ejemplo n.º 4
0
 def parse_images(self, response):
     """Add the image URLs found on ``response`` to the item in ``response.meta``.

     Yields:
         The result of ``self.item_or_request(item)``.
     """
     pending_item = response.meta['item']
     found_urls = get_image_urls(response)
     pending_item['attachment_urls'] += found_urls
     yield self.item_or_request(pending_item)
Ejemplo n.º 5
0
    def parse_detail_page(self, response):
        """Fill the item carried in ``response.meta`` from a patent detail page.

        Each row of the ``patent-bibdata`` table is flattened to one string and
        analysed with ``blur_ana_patent``; the international-classification
        row is analysed separately and merged afterwards. Title, abstract,
        description, claims and publication state are stored in
        ``item['doc']``; ``patent_type`` is set to an empty string. The page
        itself is recorded as an attachment and the page's image URLs are
        added to ``item['attachment_urls']``.

        Yields:
            The result of ``self.item_or_request(item)`` for the current item,
            then, for the first link whose text is 'Chinese', 'chinese' or
            '中文', the result of ``self.item_or_request`` for a fresh
            ``PatentItem`` pointing at that link.
        """
        item = response.meta['item']

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        row_selectors = hxs.select('//table[contains(@class,"patent-bibdata")]//tr')
        # The loop variable must not be called ``hxs``: under Python 2 a list
        # comprehension leaks its variable and would rebind the document
        # selector to the last table row.
        texts = [''.join(row.select('.//text()').extract()) for row in row_selectors]
        result_doc = blur_ana_patent(texts)
        # The classification label is matched in three language variants.
        classification_hxs = hxs.select('//td[text()="國際專利分類號"]/parent::* | '
                             '//td[text()="国际分类号"]/parent::* |'
                             '//td[text()="International Classification"]/parent::*')
        patent_state = ''.join(hxs.select('//td[text()="出版類型"]/../td[2]//text()').extract())

        texts1 = [''.join(classification_hxs.select('.//text()').extract())]
        result_doc1 = blur_ana_patent(texts1)
        doc = item['doc']
        doc.update(result_doc)
        # Classification fields are merged last so they win on key clashes.
        doc.update(result_doc1)

        patent_name = ''.join(hxs.select('//span[@class="patent-title"]//text()').extract())
        abstract = ''.join(
            hxs.select('//div[@class="patent-section patent-abstract-section"]//div[@class="patent-text"]//text()')
            .extract())
        description = ''.join(
            hxs.select('//div[@class="patent-section patent-description-section"]//div[@class="patent-text"]//text()')
            .extract())
        claims = ''.join(
            hxs.select('//div[@class="patent-section patent-claims-section"]//div[@class="patent-text"]//text()')
            .extract())

        doc['patent_name'] = patent_name
        doc['abstract'] = abstract
        doc['description'] = description
        doc['claims'] = claims
        doc['patent_state'] = patent_state
        doc['patent_type'] = ''

        # Keep the detail page itself as an attachment.
        attachments = item['attachments']
        attachments.append({
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        })
        image_urls = get_image_urls(response)
        item['attachment_urls'] += image_urls
        yield self.item_or_request(item)

        # If the page links to a Chinese version, crawl that version as well.
        link_ex = LxmlParserLinkExtractor(unique=False)
        links = link_ex.extract_links(response)
        for link in links:
            if link.text in ('Chinese', 'chinese', '中文'):
                # NOTE(review): this Request carries no meta, while this
                # callback reads response.meta['item']; presumably
                # item_or_request attaches the item to next_request - confirm.
                request = Request(link.url, callback=self.parse_detail_page)
                # A separate name keeps the current item's ``doc`` from being
                # confused with the new item's document.
                cn_doc = {
                    'data_source': 'google专利搜索',
                    'url': link.url,
                }
                cn_item = PatentItem(doc=cn_doc,
                                     next_request=request, list_url=item['list_url'], query=item['query'],
                                     attachments=[], attachment_urls=[])
                yield self.item_or_request(cn_item)
                # Only the first matching link is followed.
                break