import re
from re import compile, search, sub  # Zol_plus below calls these without the re. prefix
from time import strftime, localtime, time

from scrapy import Request, Selector
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor as LE
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.spider import iterate_spider_output

from ..items import PhoneItem  # adjust to wherever your project defines PhoneItem


class Pcpop_le(CrawlSpider):
    name = 'pcpop_le'
    allowed_domains = ['pcpop.com']
    start_urls = [
        'http://product.pcpop.com/Mobile/01334/',
        #'http://product.pcpop.com/Mobile/00670_4.html',
        #'http://product.pcpop.com/Mobile/01100/',
    ]

    def process_value(value):  # note the position: defined before rules so the Rule below can reference it
        _ = re.findall(r'\'(\d*)\'', value)
        if _:
            value = ('http://product.pcpop.com/AjaxDataOperate/ProductComment.aspx'
                     '?CurrentPage={}&F_ProductSN={}&F_SeriesSN={}&F_BrandSN={}').format(*_)
        return value

    rules = [
        Rule(LE(restrict_xpaths='//div[@class="page2"]')),
        Rule(
            LE(
                allow=r'\d+/Index\.html',
                restrict_xpaths='//ul[@id="ProductList"]/li/div[@class="title"]',
            ),
        ),
        Rule(
            LE(
                restrict_xpaths='//div[@id="proComPage"]/span',
                attrs=('onclick', 'href'),
                process_value=process_value,
            ),
        ),
        Rule(
            LE(
                restrict_xpaths='//div[@class="title"]',
                attrs=('href',),
                unique=False,
            ),
            callback='parse_item',
        ),
    ]

    def parse_item(self, response):
        xp = lambda s: response.xpath(s).extract()
        name = xp('//ul[@id="proComList"]/li/div[@class="dian"]/a[last()]/text()')[0]
        item = PhoneItem(url=response.url)
        item['product_name'] = re.match(r'(.+?)\s', name).group(1)
        item['product_type'] = name
        item['series'] = re.search(r' ([a-zA-Z]+)', name).group(1)
        item['integration_valuation_score'] = xp('//ul[@id="proComList"]/li/div[1]/span//text()')[1]
        item['evaluation'] = ' '.join(xp('//ul[@id="proComList"]/li/dl//text()'))
        item['publish_date'] = xp('//ul[@id="proComList"]/li/div[2]/span/text()')[0]
        yield item
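# A minimal standalone sketch of what process_value above does, assuming the pcpop
# pagination spans carry an onclick like the hypothetical value below (the four
# quoted numbers standing in for page / product / series / brand serials):
import re

onclick = "GetCommentList('2','01334','00670','0123')"  # hypothetical sample value
nums = re.findall(r'\'(\d*)\'', onclick)
if nums:
    print('http://product.pcpop.com/AjaxDataOperate/ProductComment.aspx'
          '?CurrentPage={}&F_ProductSN={}&F_SeriesSN={}&F_BrandSN={}'.format(*nums))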
class Tencent(CrawlSpider):
    custom_settings = {
        'DEPTH_LIMIT': 1,
    }
    name = "tencent"
    allowed_domains = ["tencent.com"]
    start_urls = ["https://hr.tencent.com/position.php"]

    rules = (
        Rule(
            LE(allow=r"start=\d{,4}#a"),
            follow=True,
            callback='parse_item',  # note: must be the string name, not callback=self.parse_item
        ),
    )

    def parse_item(self, response):
        print(response.url, response.meta['depth'])
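# A minimal sketch of running one of these spiders in-process instead of via
# `scrapy crawl tencent`. With DEPTH_LIMIT set to 1 in custom_settings, only the
# start page and pages one hop away are crawled, so parse_item prints depth <= 1.
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess({'LOG_LEVEL': 'WARNING'})
    process.crawl(Tencent)
    process.start()  # blocks until the crawl finishes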
class ZOl_inc(CrawlSpider):
    name = 'zol_inc'
    allowed_domains = ['zol.com.cn']
    start_urls = [
        'http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&keyword=vivo',
        'http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&keyword=OPPO',
        'http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&manuId=34645&keyword=%D0%A1%C3%D7',
        'http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&manuId=1434&keyword=%F7%C8%D7%E5',
        'http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&manuId=35005&keyword=%C5%AC%B1%C8%D1%C7',
        'http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&manuId=613&keyword=%BB%AA%CE%AA',
    ]
    TIME = strftime('%Y%m%d%H%M')
    SCRAPY_DATE = strftime('%Y-%m-%d')

    rules = [
        Rule(
            LE(
                tags=('a',),
                restrict_xpaths='//div[@class="page"]',
                deny='isFilter',
            ),
            follow=True,
            callback='parse_num',
        ),
    ]

    def parse_num(self, response):
        for sel in response.xpath('//div[@class="list-box"]/div[@class="list-item clearfix"]'):
            num = sel.xpath('div[@class="pro-intro"]/div[@class="special clearfix"]/div[@class="grade"]/span/a/@href').re(r'/\d+/(\d+)/review\.shtml')[0]
            url = 'http://detail.zol.com.cn/xhr3_Review_GetListAndPage_order=2%5EisFilter=1%5EproId=' + num + '%5Epage=1.html'
            yield Request(url, callback=self.parse_item)

    def parse_item(self, response):
        sel = Selector(text=response.body.decode('unicode_escape').replace('\\/', '/'))
        yesterday = strftime('%Y-%m-%d', localtime(time() - 24 * 3600))  # convert the timestamp into a concrete date
        for _ in sel.xpath('//li[@class="comment-item"]/div[@class="comments-list-content"]'):
            DATE = _.xpath('div[@class="single-score clearfix"]/span/text()').extract()[0]
            if DATE == yesterday:
                url = _.xpath('div[@class="comments-content"]/h3/a/@href').extract()[0]
                yield Request(response.urljoin(url), callback=self.parse_comment)
            elif DATE < yesterday:  # ISO dates compare correctly as strings
                return
        next_page = sel.xpath('//div[@class="page"]/a[last()]/@href').re(r'page=\d+')[0]
        next_page = re.sub(r'page=\d+', next_page, response.url)
        yield Request(next_page, callback=self.parse_item)

    def parse_comment(self, response):
        xp = lambda s: response.xpath(s).extract()
        evaluation = re.sub(r'\s', '', xp('string(//div[@class="comments-content"])')[0])
        score = xp('//ul[@class="score-item clearfix"]/li/span[2]/text()')
        integration = xp('(//div[@class="comments-score clearfix"]/div)[1]/strong/text()')
        item = PhoneItem()
        item['publish_date'] = evaluation[:10]
        item['url'] = response.url
        item['customer_id'] = xp('//div[@class="comments-user-name"]//a/text()')[0]
        item['evaluation'] = evaluation[10:]
        item['integration_valuation_score'] = integration[0] if integration else ''
        (
            item['battery_score'],
            item['screen_score'],
            item['photo_score'],
            item['video_entertainment_score'],
            item['appearance_score'],
            item['cost_performance_score'],
        ) = score if score else ('', '', '', '', '', '')
        yield item
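# Why parse_item above re-wraps the body in a Selector: the xhr3_Review_GetListAndPage
# endpoint answers with unicode-escaped, slash-escaped markup rather than a normal page.
# A small demo on a made-up fragment of such a body:
escaped = b'{"html":"<a href=\\/1140\\/1139537\\/review.shtml>\\u597d\\u8bc4<\\/a>"}'
print(escaped.decode('unicode_escape').replace('\\/', '/'))
# -> {"html":"<a href=/1140/1139537/review.shtml>好评</a>"}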
class Zol_plus(CrawlSpider):
    name = 'zol_plus'
    allowed_domains = ['zol.com.cn']

    rules = [
        # e.g. http://detail.zol.com.cn/1140/1139537/review.shtml
        Rule(LE(allow=r'/\d+/\d+/review\.shtml')),
        # e.g. http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&keyword=vivo&page=2
        # e.g. http://detail.zol.com.cn/xhr3_Review_GetListAndPage_proId=392874%5EisFilter=1%5Eorder=1.html
        Rule(
            LE(restrict_xpaths='//div[@class="page"]'),
            follow=True,  # follow must be True here, otherwise this rule never fires no matter the depth (True is already the default when there is no callback)
            process_links='process_links',
        ),
        # e.g. http://detail.zol.com.cn/1140/1139537/review_0_0_1335425_1.shtml#tagNav
        Rule(
            LE(
                tags=('a',),
                restrict_xpaths='//div[@class="comments-content"]',
                allow=r'/\d+/\d+/review',
            ),
            callback='parse_comment',
        ),
    ]

    # zol is picky: requests must carry cookie and header information or access is refused
    def start_requests(self):
        urls = ['http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&keyword=vivo']
        cookie = {
            'listSubcateId': '57',
            'visited_serachKw': 'vivo',
        }
        head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
        }
        for url in urls:
            yield Request(url, cookies=cookie, dont_filter=True, headers=head)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if 'isFilter' in response.url:  # watch it!
            # response.body (like requests' r.content) is bytes; response.text (like r.text) is str
            response = response.replace(body=response.body.decode('unicode_escape').replace('\\/', '/'))
        # why doesn't super()._parse_response(response, callback, cb_kwargs, follow=True) work here?
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item
        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def process_links(self, links):
        regex = compile(r'(?<=&)proId=\d*|(?<=&)page=\d*')
        for link in links:
            _ = regex.findall(link.url)
            if 'isFilter' in link.url:  # no 'isFilter' means the link comes from the start_urls pagination, so leave it alone
                link.url = 'http://detail.zol.com.cn/xhr3_Review_GetListAndPage_order=1%5EisFilter=1%5E{}%5E{}.html'.format(*_)
            yield link

    def parse_comment(self, response):
        xp = lambda s: response.xpath(s).extract()
        name = xp('(//div[@class="breadcrumb"]/a)[4]/text()')[0]  # note the parentheses
        evaluation = sub(r'\s', '', xp('string(//div[@class="comments-content"])')[0])  # note the use of \s
        score = xp('//ul[@class="score-item clearfix"]/li/span[2]/text()')
        integration = xp('(//div[@class="comments-score clearfix"]/div)[1]/strong/text()')
        item = PhoneItem(url=response.url)
        item['product_name'], item['product_type'] = name.split(' ', 1)  # split only once
        item['series'] = search(r' ([a-zA-Z]+)', name).group(1)  # group(1), to stay generic
        item['customer_id'] = xp('//div[@class="comments-user-name"]//a/text()')[0]
        item['publish_date'] = evaluation[:10]
        item['evaluation'] = evaluation[10:]
        item['integration_valuation_score'] = integration[0] if integration else ''
        (
            item['battery_score'],
            item['screen_score'],
            item['photo_score'],
            item['video_entertainment_score'],
            item['appearance_score'],
            item['cost_performance_score'],
        ) = score if score else ('', '', '', '', '', '')
        yield item
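# A hedged sketch of the URL rewrite process_links performs, fed a hand-built Link
# whose query shape (&proId=...&page=...) is assumed here purely for illustration:
from re import compile
from scrapy.link import Link

regex = compile(r'(?<=&)proId=\d*|(?<=&)page=\d*')
link = Link('http://detail.zol.com.cn/index.php?c=Review&isFilter=1&proId=392874&page=2')
parts = regex.findall(link.url)  # -> ['proId=392874', 'page=2']
print('http://detail.zol.com.cn/xhr3_Review_GetListAndPage_order=1%5EisFilter=1%5E{}%5E{}.html'.format(*parts))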
class Mininova(CrawlSpider):
    name = 'mininova'
    allowed_domains = ['mininova.org']
    start_urls = ['http://www.mininova.org']

    rules = (
        Rule(
            LE(
                restrict_xpaths='//table[1]//a',
                # allow='/tor/\d+',
            ),
            process_links='process_links',  # this filter is rarely needed; the LE itself can already narrow the URLs
            process_request='process_request',
            callback='parse_torrent',
            follow=True,
        ),
        Rule(LE(restrict_xpaths='//h1//a')),
    )

    def parse(self, response):
        print('parse')
        # entry point that triggers the Rules: called first via parse, then each time
        # by _response_downloaded, forming a closed loop
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            # first call goes to parse_start_url; later calls go to the callback named in the Rule
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)  # returns whatever it is given
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item
        if follow and self._follow_links:  # if follow=True, keep following (default True)
            for request_or_item in self._requests_to_follow(response):  # apply the rules
                yield request_or_item

    def _requests_to_follow(self, response):  # called by _parse_response
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):  # iterate over the rules
            # the LE filtering has already happened here, before process_request
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:  # if a process_links filter exists, run the links through it
                links = rule.process_links(links)
            for link in links:
                # link is an object holding url and text; every one is an address to visit,
                # and each gets added to the seen set
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                # Request's full default signature:
                # Request(url, callback=None, method='GET', headers=None, body=None, cookies=None,
                #         meta=None, encoding='utf-8', priority=0, dont_filter=False, errback=None)
                r.meta.update(rule=n, link_text=link.text)  # attach meta information
                yield rule.process_request(r)  # call the rule's process_request, passing it the Request

    def _response_downloaded(self, response):  # called by _requests_to_follow
        rule = self._rules[response.meta['rule']]  # look up the n-th rule
        # uses the rule's follow and callback settings
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def parse_start_url(self, response):
        print('parse_start_url')
        for sel in response.xpath('//table[1]//a/@href').extract():
            if '/tor/' in sel:
                print(sel)

    def process_links(self, links):
        print('process_links')
        for link in links:
            if '/tor/' in link.url:
                print(link.url, '------------------\n')
                yield link

    def process_request(self, request):
        print('process_request')
        request.headers['User-Agent'] = "Mozilla/5.0"
        # request.meta['proxy'] = 'http://183.207.228.11:86'  # watch it!
        return request

    def parse_torrent(self, response):
        print('parse_torrent', response.request.headers['User-Agent'])
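# The call chain the annotated methods above trace out (stock CrawlSpider wiring):
#
#   parse(response)
#     -> _parse_response(response, parse_start_url)      # first pass: start_urls
#          -> callback results yielded as items/requests
#          -> _requests_to_follow(response)              # each Rule's LE extracts links,
#               process_links -> Request(callback=_response_downloaded) -> process_request
#   _response_downloaded(response)                       # a rule-extracted page arrives
#     -> _parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
#          -> ...back into _requests_to_follow, closing the loop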
class ZOl(CrawlSpider):
    name = 'zol'
    allowed_domains = ['zol.com.cn']

    # zol is picky: requests must carry cookie and header information or access is refused
    def start_requests(self):
        urls = ['http://detail.zol.com.cn/index.php?c=SearchList&subcateId=57&keyword=vivo']
        cookie = {
            'listSubcateId': '57',
            'visited_serachKw': 'vivo',
        }
        head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
        }
        for url in urls:
            yield Request(url, cookies=cookie, dont_filter=True, headers=head)

    def process_value(value):  # note the position: defined before rules references it
        print(re.sub(r'page', 'AVATAR', value))
        return value  # must be return, not yield

    rules = [
        Rule(
            LE(
                tags=('a',),  # only effective within this rule
                attrs=('href',),
                restrict_xpaths='//div[@class="page"]',
                canonicalize=False,  # True would strip #fragments, sort query parameters by name, etc.
                process_value=process_value,  # for testing!
                # process_value=lambda x: x + '###',
                # deny='isFilter' skips every comment page, greatly cutting depth and request count;
                # it also shows that pages requested by the second rule can be matched by this rule too
                deny='isFilter',
            ),
            # callback='parse_item',
            follow=True,
        ),
        Rule(
            LE(allow=r'/\d+/\d+/review\.shtml'),
            callback='parse_item',
        ),
    ]

    def parse_item(self, response):
        # this step is required: the body comes back unicode-escaped with escaped slashes
        sel = Selector(text=response.body.decode('unicode_escape').replace('\\/', '/'))
        # note the lookaround assertions
        regex = re.compile(r'(?<=/)\d+(?=/review\.shtml)|(?<=proId=)\d+')
        proID = regex.search(response.url).group()
        # best not to use relative positions such as div[3] in the paths
        for url in sel.xpath('//div[@class="comments-content"]/h3/a/@href').extract():
            yield Request(response.urljoin(url), callback=self.parse_comment, meta={'proID': proID})
        next_page = sel.xpath('//div[@class="page"]/a[last()]/@href').extract()
        if next_page:
            num = re.search(r'page=(\d*)', next_page[0]).group(1)
            print(num, response.url)
            # order=1 is the composite ranking, order=2 is newest-first; the default is 1
            next_page = 'http://detail.zol.com.cn/xhr3_Review_GetListAndPage_order=1%5EisFilter=1%5EproId={}%5Epage={}.html'.format(proID, num)
            yield Request(next_page, callback=self.parse_item)

    def parse_comment(self, response):
        xp = lambda s: response.xpath(s).extract()
        name = xp('(//div[@class="breadcrumb"]/a)[4]/text()')[0]  # note the parentheses
        evaluation = re.sub(r'\s', '', xp('string(//div[@class="comments-content"])')[0])  # note the use of \s
        score = xp('//ul[@class="score-item clearfix"]/li/span[2]/text()')
        integration = xp('(//div[@class="comments-score clearfix"]/div)[1]/strong/text()')
        item = PhoneItem(url=response.url, product_id=response.meta['proID'])
        item['product_name'], item['product_type'] = name.split(' ', 1)  # split only once
        item['series'] = re.search(r' ([a-zA-Z]+)', name).group(1)  # group(1), to stay generic
        item['customer_id'] = xp('//div[@class="comments-user-name"]//a/text()')[0]
        item['publish_date'] = evaluation[:10]
        item['evaluation'] = evaluation[10:]
        item['integration_valuation_score'] = integration[0] if integration else ''
        (
            item['battery_score'],
            item['screen_score'],
            item['photo_score'],
            item['video_entertainment_score'],
            item['appearance_score'],
            item['cost_performance_score'],
        ) = score if score else ('', '', '', '', '', '')
        yield item
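# A quick check of the lookaround regex parse_item uses: it pulls the product id
# out of both URL shapes this spider encounters.
import re

regex = re.compile(r'(?<=/)\d+(?=/review\.shtml)|(?<=proId=)\d+')
print(regex.search('http://detail.zol.com.cn/1140/1139537/review.shtml').group())
# -> 1139537
print(regex.search('http://detail.zol.com.cn/xhr3_Review_GetListAndPage_order=1'
                   '%5EisFilter=1%5EproId=1139537%5Epage=2.html').group())
# -> 1139537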