Esempio n. 1
0
def get_image_urls(response):
    """Return the URLs of the ``<img src=...>`` links found in *response*.

    Links are collected with ``LxmlParserLinkExtractor`` configured for the
    ``img`` tag, the ``src`` attribute and ``unique=True``.

    Note from the original author: the SGML-based extractor did not seem to
    work and the HtmlParser-based one was slower than lxml, hence lxml here.
    """
    extractor = LxmlParserLinkExtractor(tag='img', attr='src', unique=True)
    return [image_link.url for image_link in extractor.extract_links(response)]
Esempio n. 2
0
    def parse_detail_page(self, response):
        """Parse a patent detail page, fill in the item and yield the results.

        The item to complete is read from ``response.meta['item']``. Its
        ``doc`` dict is updated in place with the fields produced by
        ``blur_ana_patent`` (from the bibliographic table rows and from the
        classification row) plus ``patent_name``, ``abstract``,
        ``description``, ``claims``, ``patent_state`` and an empty
        ``patent_type``. The raw page is appended to ``item['attachments']``
        and the page's image URLs are added to ``item['attachment_urls']``.

        Yields:
            ``self.item_or_request(item)`` for the current page, and, when a
            link whose text is 'Chinese', 'chinese' or '中文' is present, one
            more ``self.item_or_request(...)`` for a new ``PatentItem`` that
            points at that link (only the first matching link is followed).
        """
        item = response.meta['item']

        def joined_text(selector, xpath):
            # Concatenate every string the XPath extracts from `selector`.
            return ''.join(selector.select(xpath).extract())

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        row_selectors = hxs.select('//table[contains(@class,"patent-bibdata")]//tr')
        # Use a dedicated loop name: under Python 2 a list-comprehension
        # variable leaks into the enclosing scope, so reusing `hxs` here would
        # rebind the document selector to the last table row.
        texts = [joined_text(row, './/text()') for row in row_selectors]
        result_doc = blur_ana_patent(texts)

        # The classification label differs per page language (traditional
        # Chinese, simplified Chinese, English); select the whole row.
        classification_hxs = hxs.select('//td[text()="國際專利分類號"]/parent::* | '
                             '//td[text()="国际分类号"]/parent::* |'
                             '//td[text()="International Classification"]/parent::*')
        patent_state = joined_text(hxs, '//td[text()="出版類型"]/../td[2]//text()')

        classification_texts = [joined_text(classification_hxs, './/text()')]
        classification_doc = blur_ana_patent(classification_texts)

        doc = item['doc']
        doc.update(result_doc)
        doc.update(classification_doc)

        doc['patent_name'] = joined_text(hxs, '//span[@class="patent-title"]//text()')

        # abstract / description / claims live in identically structured
        # sections that differ only by the section name in the class value.
        section_xpath = ('//div[@class="patent-section patent-%s-section"]'
                         '//div[@class="patent-text"]//text()')
        for section in ('abstract', 'description', 'claims'):
            doc[section] = joined_text(hxs, section_xpath % section)

        doc['patent_state'] = patent_state
        doc['patent_type'] = ''

        # Keep the page itself as an attachment alongside its images.
        item['attachments'].append({
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response),
        })
        item['attachment_urls'] += get_image_urls(response)
        yield self.item_or_request(item)

        # If a Chinese version of the page exists, crawl it as well.
        link_ex = LxmlParserLinkExtractor(unique=False)
        for link in link_ex.extract_links(response):
            if link.text not in ('Chinese', 'chinese', '中文'):
                continue
            request = Request(link.url, callback=self.parse_detail_page)
            cn_doc = {
                'data_source': 'google专利搜索',
                'url': link.url,
            }
            cn_item = PatentItem(doc=cn_doc,
                                 next_request=request, list_url=item['list_url'], query=item['query'],
                                 attachments=[], attachment_urls=[])
            yield self.item_or_request(cn_item)
            # Only the first matching link is followed.
            break