# Scrapy 1.x import path; older releases use scrapy.contrib.linkextractors.lxmlhtml
from scrapy.linkextractors.lxmlhtml import LxmlParserLinkExtractor


def get_image_urls(response):
    # SGML seems not to work; HtmlParser is slower than lxml, so use lxml.
    # img_ex = SgmlLinkExtractor(allow=(), tags=('img',), attrs=('src',))
    # img_ex = HtmlParserLinkExtractor(tag='img', attr='src', unique=True)
    img_ex = LxmlParserLinkExtractor(tag='img', attr='src', unique=True)
    links = img_ex.extract_links(response)
    urls = [link.url for link in links]
    return urls
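A minimal sketch of calling this helper from a spider callback. The spider class, its name, and the start URL are hypothetical; only get_image_urls comes from the snippet above.

import scrapy


class ImageSpider(scrapy.Spider):
    name = 'images'                      # hypothetical
    start_urls = ['http://example.com']  # hypothetical

    def parse(self, response):
        # log every <img src> URL found on the page
        for url in get_image_urls(response):
            self.log('image: %s' % url)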
from itertools import chain

# pre-1.0 Scrapy import paths; later versions move these under
# scrapy.spiders and scrapy.linkextractors
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.regex import RegexLinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.linkextractors.lxmlhtml import LxmlParserLinkExtractor


class BaseSpider(CrawlSpider):
    name = "Base"
    site__id = None
    extractors = None
    allowed_domains = []
    start_urls = []
    session = None

    rules = (
        Rule(RegexLinkExtractor(), callback='parse_item'),
        Rule(SgmlLinkExtractor(), callback='parse_item'),
        Rule(LxmlParserLinkExtractor(), callback='parse_item'),
    )

    def process_results(self, response, results):
        return chain(results, self.parse_item(response))

    def parse_item(self, response):
        for extractor in self.extractors:
            values = {
                'URL_PROD': response.url,
            }
            extract = {}
            for e in extractor(response):
                extract.update(e)
            # TODO: check relevance if overwriting
            for k, v in extract.iteritems():
                values[k] = v[0]
            name = values.get('NAME_PROD')
            if name:
                # ScraperItem is the project's item class
                yield ScraperItem(name=name, site=self.site__id,
                                  values=values.iteritems())
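BaseSpider leaves extractors as None; from parse_item one can infer the contract: each extractor is a callable that takes a response and yields dicts mapping field names to lists of candidate values. A hedged sketch of a concrete subclass under that assumption; the XPath, spider name, domain, and site__id are all hypothetical.

def title_extractor(response):
    # yields {field: [values]} dicts, matching the v[0] indexing
    # in parse_item above
    yield {'NAME_PROD': response.xpath('//h1/text()').extract()}


class ExampleSpider(BaseSpider):
    name = 'example'                    # hypothetical
    site__id = 1                        # hypothetical
    allowed_domains = ['example.com']   # hypothetical
    start_urls = ['http://example.com/products']
    extractors = [title_extractor]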
# FilteringLinkExtractor is a Scrapy-internal wrapper; in Scrapy 1.x both
# classes live under scrapy.linkextractors (the exact path varies by version)
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.linkextractors.lxmlhtml import LxmlParserLinkExtractor


def __init__(self, skip_text=False):
    # wrap the bare lxml extractor; with every filter left empty this
    # effectively just canonicalizes the extracted links
    lx = LxmlParserLinkExtractor()
    self.linkextractor = FilteringLinkExtractor(
        lx, allow=(), deny=(), allow_domains=(), deny_domains=(),
        restrict_xpaths=(), canonicalize=True, deny_extensions=None)
    self.skip_text = skip_text
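For reference, a sketch of the bare extractor that the wrapper above filters, run against a synthetic response; the URL and body are illustrative. The returned Link objects expose url and text, with relative hrefs resolved against the response URL (the same extract_links call the other snippets here rely on).

from scrapy.http import HtmlResponse
from scrapy.linkextractors.lxmlhtml import LxmlParserLinkExtractor

# synthetic page with one relative and one absolute link
response = HtmlResponse(url='http://example.com/',
                        body='<a href="/a">a</a> <a href="http://example.org/b">b</a>')
for link in LxmlParserLinkExtractor().extract_links(response):
    print link.url, link.text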
# module-level imports this snippet relies on; Page is a project-local item class
import scrapy.http
import tldextract
from scrapy.contrib.linkextractors.lxmlhtml import LxmlParserLinkExtractor


def parse(self, response):
    ret = []
    title = "".join(
        response.xpath(".//span[@id='sites-page-title']/text()").extract())
    # print "Page Title", title
    url = response.url
    path = url.replace(" ", "").replace("/", "_").replace(":", "")
    content = response.xpath(".//*[@id='sites-canvas-main-content']/table")
    links = []
    domains = []
    # print "Page Content", content
    l = LxmlParserLinkExtractor()  # instantiated but never used below
    for e in content:
        # note: an xpath starting with '//' searches the whole document,
        # not just this table
        for x in e.xpath('//*//@href'):
            y = x.extract()
            links.append(y)
            # split the href into subdomain / domain / suffix
            domres = tldextract.extract(y)
            sd = domres.subdomain
            dm = domres.domain
            tld = domres.suffix
            if dm:
                # print domres
                d = "{0}.{1}.{2}".format(sd, dm, tld)
                if d not in domains:
                    domains.append(d)
    content = "".join(content.extract())
    p = Page(title=title, path=path, url=url, content=content,
             links=links, domains=domains)
    ret.append(p)
    # spider the other local pages
    for sel in response.xpath('//ul/li/div'):
        title = sel.xpath('a/text()').extract()
        link = sel.xpath('a/@href').extract()
        desc = sel.xpath('text()').extract()
        if link:
            l = self.base + link[0]
            # print title, l, desc
            ret.append(scrapy.http.Request(l))
    return ret
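parse above returns Page objects alongside follow-up Requests. Page is project-local and not shown; a hedged sketch of an Item definition that would satisfy the constructor call, with field names taken from that call (the real class may differ):

from scrapy.item import Item, Field


class Page(Item):
    # fields inferred from the Page(...) constructor call above
    title = Field()
    path = Field()
    url = Field()
    content = Field()
    links = Field()
    domains = Field()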
# module-level imports this snippet relies on; response_html5parse,
# blur_ana_patent, get_mime_type_in_response, PatentItem and
# self.item_or_request are project-local helpers
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.lxmlhtml import LxmlParserLinkExtractor


def parse_detail_page(self, response):
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    texts_hxs = hxs.select('//table[contains(@class,"patent-bibdata")]//tr')
    # use a distinct loop variable so the comprehension does not rebind hxs
    texts = [''.join(row.select('.//text()').extract()) for row in texts_hxs]
    # texts = clean_google_texts(texts)
    result_doc = blur_ana_patent(texts)
    classification_hxs = hxs.select(
        '//td[text()="國際專利分類號"]/parent::* | '
        '//td[text()="国际分类号"]/parent::* | '
        '//td[text()="International Classification"]/parent::*')
    patent_state = ''.join(
        hxs.select('//td[text()="出版類型"]/../td[2]//text()').extract())
    texts1 = [''.join(classification_hxs.select('.//text()').extract())]
    result_doc1 = blur_ana_patent(texts1)
    doc = item['doc']
    doc.update(result_doc)
    doc.update(result_doc1)
    patent_name = ''.join(
        hxs.select('//span[@class="patent-title"]//text()').extract())
    abstract = ''.join(
        hxs.select('//div[@class="patent-section patent-abstract-section"]'
                   '//div[@class="patent-text"]//text()').extract())
    description = ''.join(
        hxs.select('//div[@class="patent-section patent-description-section"]'
                   '//div[@class="patent-text"]//text()').extract())
    claims = ''.join(
        hxs.select('//div[@class="patent-section patent-claims-section"]'
                   '//div[@class="patent-text"]//text()').extract())
    doc['patent_name'] = patent_name
    doc['abstract'] = abstract
    doc['description'] = description
    doc['claims'] = claims
    doc['patent_state'] = patent_state
    doc['patent_type'] = ''
    attachments = item['attachments']
    attach1 = {
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response)
    }
    attachments.append(attach1)
    image_urls = get_image_urls(response)
    item['attachment_urls'] += image_urls
    yield self.item_or_request(item)
    # If the patent has a Chinese version, crawl that version as well
    link_ex = LxmlParserLinkExtractor(unique=False)
    links = link_ex.extract_links(response)
    for link in links:
        if link.text in ['Chinese', 'chinese', '中文']:
            request = Request(link.url, callback=self.parse_detail_page)
            doc = {
                'data_source': 'google专利搜索',
                'url': link.url,
            }
            cn_item = PatentItem(doc=doc, next_request=request,
                                 list_url=item['list_url'],
                                 query=item['query'],
                                 attachments=[], attachment_urls=[])
            yield self.item_or_request(cn_item)
            break
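get_mime_type_in_response is a project-local helper that is not shown. A plausible minimal sketch, assuming it just reads the Content-Type header off the response; this is an assumption, not the project's actual code.

def get_mime_type_in_response(response):
    # assumed behavior: Content-Type header minus any charset parameter
    content_type = response.headers.get('Content-Type', '')
    return content_type.split(';')[0].strip()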