Example #1
def get_image_urls(response):
    # SGML does not seem to work, and HtmlParser is slower than lxml, so use the lxml extractor.
    # img_ex = SgmlLinkExtractor(allow=(), tags=('img',), attrs=('src',))
    # img_ex = HtmlParserLinkExtractor(tag='img', attr='src', unique=True)
    img_ex = LxmlParserLinkExtractor(tag='img', attr='src', unique=True)
    links = img_ex.extract_links(response)
    urls = [link.url for link in links]
    return urls
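
A short usage sketch (hypothetical spider; the name and start URL are placeholders, not part of the original example) showing how get_image_urls could be called from a spider callback:

import scrapy

class ImageSpider(scrapy.Spider):
    # Hypothetical spider using the helper above.
    name = "image_example"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # Yield one record per <img src="..."> URL found on the page.
        for url in get_image_urls(response):
            yield {"image_url": url}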
Example #2
class BaseSpider(CrawlSpider):
    name = "Base"
    site__id = None
    extractors = None
    allowed_domains = []
    start_urls = []
    session = None
    rules = (
        Rule(RegexLinkExtractor(), callback='parse_item'),
        Rule(SgmlLinkExtractor(), callback='parse_item'),
        Rule(LxmlParserLinkExtractor(), callback='parse_item'),
    )

    def process_results(self, response, results):
        return chain(results, self.parse_item(response))

    def parse_item(self, response):
        for extractor in self.extractors:
            values = {
                'URL_PROD': response.url,
                }
            extract = {}
            for e in extractor(response):
                extract.update(e)  # TODO: check whether overwriting earlier values is acceptable
            for k, v in extract.iteritems():
                values[k] = v[0]
            name = values.get('NAME_PROD')
            if name:
                yield ScraperItem(name=name, site=self.site__id, values=values.iteritems())
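
A hedged sketch of a concrete subclass: the spider name, domain, XPath and extractor below are illustrative assumptions about how extractors is meant to be populated (each extractor yields dicts mapping a field name to a list of values, which matches the v[0] indexing in parse_item):

def name_extractor(response):
    # Illustrative extractor: field name -> list of candidate values.
    yield {'NAME_PROD': response.xpath('//h1/text()').extract()}

class ExampleSiteSpider(BaseSpider):
    # Hypothetical subclass; every value below is a placeholder.
    name = "example_site"
    site__id = 1
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com/products"]
    extractors = [name_extractor]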
Example #3
    def __init__(self, skip_text=False):
        lx = LxmlParserLinkExtractor()
        self.linkextractor = FilteringLinkExtractor(lx,
                                                    allow=(),
                                                    deny=(),
                                                    allow_domains=(),
                                                    deny_domains=(),
                                                    restrict_xpaths=(),
                                                    canonicalize=True,
                                                    deny_extensions=None)
        self.skip_text = skip_text
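
The fragment only builds the wrapped extractor; how it is consumed is not shown. A plausible companion method (an assumption for illustration; what skip_text controls is not visible in the fragment, so it is left out) would simply delegate to the wrapper:

    def extract_links(self, response):
        # Hypothetical usage of the wrapper built in __init__: the
        # FilteringLinkExtractor applies the allow/deny settings before
        # returning Link objects.
        return self.linkextractor.extract_links(response)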
Example #4
    def parse(self, response):
        ret = []
        title = "".join(
            response.xpath(".//span[@id='sites-page-title']/text()").extract())
        #        print "Page Title", title
        url = response.url
        path = url.replace(" ", "").replace("/", "_").replace(":", "")

        content = response.xpath(".//*[@id='sites-canvas-main-content']/table")
        links = []
        domains = []
        #print "Page Content", content

        l = LxmlParserLinkExtractor()  # instantiated here, but the hrefs below are pulled directly via XPath
        for e in content:
            # Collect only the hrefs inside this content block (relative XPath).
            for x in e.xpath('.//@href'):
                y = x.extract()
                links.append(y)
                domres = tldextract.extract(y)
                sd = domres.subdomain
                dm = domres.domain
                tld = domres.suffix
                if dm:
                    #print domres
                    # Join only the non-empty parts so a missing subdomain does not leave a leading dot.
                    d = ".".join(part for part in (sd, dm, tld) if part)
                    if d not in domains:
                        domains.append(d)

        content = "".join(content.extract())
        p = Page(title=title,
                 path=path,
                 url=url,
                 content=content,
                 links=links,
                 domains=domains)
        ret.append(p)

        # spider the other local pages
        for sel in response.xpath('//ul/li/div'):
            title = sel.xpath('a/text()').extract()
            link = sel.xpath('a/@href').extract()
            desc = sel.xpath('text()').extract()
            if link:
                l = self.base + link[0]
                #print title, l, desc
                ret.append(scrapy.http.Request(l))

        return ret
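
Page is not defined in this snippet; a minimal stand-in with the same constructor signature (an assumption, the real project's definition may differ) could be a plain scrapy.Item:

import scrapy

class Page(scrapy.Item):
    # Stand-in item declaring the fields used by parse() above.
    title = scrapy.Field()
    path = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()
    links = scrapy.Field()
    domains = scrapy.Field()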
Example #5
    def parse_detail_page(self, response):
        item = response.meta['item']

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        texts_hxs = hxs.select('//table[contains(@class,"patent-bibdata")]//tr')
        texts = [''.join(row.select('.//text()').extract()) for row in texts_hxs]
        # texts = clean_google_texts(texts)
        result_doc = blur_ana_patent(texts)
        classification_hxs = hxs.select('//td[text()="國際專利分類號"]/parent::* | '
                                        '//td[text()="国际分类号"]/parent::* | '
                                        '//td[text()="International Classification"]/parent::*')
        patent_state = ''.join(hxs.select('//td[text()="出版類型"]/../td[2]//text()').extract())

        texts1 = [''.join(classification_hxs.select('.//text()').extract())]
        result_doc1 = blur_ana_patent(texts1)
        doc = item['doc']
        doc.update(result_doc)
        doc.update(result_doc1)

        patent_name = ''.join(hxs.select('//span[@class="patent-title"]//text()').extract())
        abstract = ''.join(
            hxs.select('//div[@class="patent-section patent-abstract-section"]//div[@class="patent-text"]//text()')
            .extract())
        description = ''.join(
            hxs.select('//div[@class="patent-section patent-description-section"]//div[@class="patent-text"]//text()')
            .extract())
        claims = ''.join(
            hxs.select('//div[@class="patent-section patent-claims-section"]//div[@class="patent-text"]//text()')
            .extract())

        doc['patent_name'] = patent_name
        doc['abstract'] = abstract
        doc['description'] = description
        doc['claims'] = claims
        doc['patent_state'] = patent_state
        doc['patent_type'] = ''

        attachments = item['attachments']
        attach1 = {
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        }
        attachments.append(attach1)
        image_urls = get_image_urls(response)
        item['attachment_urls'] += image_urls
        yield self.item_or_request(item)
        # If a Chinese-language version exists, crawl it as well.
        link_ex = LxmlParserLinkExtractor(unique=False)
        links = link_ex.extract_links(response)
        for link in links:
            if link.text in ['Chinese', 'chinese', '中文']:
                request = Request(link.url, callback=self.parse_detail_page)
                doc = {
                    'data_source': 'google专利搜索',
                    'url': link.url,
                }
                cn_item = PatentItem(doc=doc,
                                     next_request=request, list_url=item['list_url'], query=item['query'],
                                     attachments=[], attachment_urls=[])
                yield self.item_or_request(cn_item)
                break
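
item_or_request is not included in this snippet; one plausible implementation (purely an assumption about the original spider, consistent with response.meta['item'] being read at the top of parse_detail_page) hands back the item's pending next_request when there is one, and the finished item otherwise:

    def item_or_request(self, item):
        # Hypothetical helper: if the item still carries a follow-up request,
        # attach the item to that request's meta and schedule it; otherwise
        # the item is complete and goes straight to the pipelines.
        request = item.get('next_request')
        if request is not None:
            item['next_request'] = None
            request.meta['item'] = item
            return request
        return item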