Beispiel #1
0
	def parse(self, response):
		"""Parse one round page: queue the previous round page and yield its videos.

		Yields at most one ``Request`` for the preceding round page (skipped when
		``RedisUtil`` already holds that URL), followed by one ``SerieaItem`` per
		``<li>`` of the video list. ``self.s_round`` is decremented on every call
		and ``self.is_over`` is set once it reaches 0.

		NOTE(review): progress is kept on the spider instance, so the round
		stamped on each item presumably relies on responses being handled in
		the order they were requested -- confirm against the crawl settings.
		"""
		hxs = HtmlXPathSelector(response)

		# Initialise the most recent match round.
		if self.s_round == 0 and not self.is_over:
			pre_last_url = "/html/body/div[@class='wrapper_zt']/div[@id='ztg_7']/div[@class='col_1']/div[@id='ztu_8']/div[@class='bd']/div[@class='inner']/div/table[@class='video_ct']/tbody/tr/td/div[@class='video_area']/table/tbody/tr[1]/td/div[@class='video_pic']/a/@href"
			pre_last = list_last_item(hxs.select(pre_last_url).extract())
			# Raw string: "\w" is a regex class here, not a string escape.
			self.s_round = get_round(r'http://v.qq.com/zt2012/italy/italy(\w+).htm', pre_last)

		if self.s_round < 0:
			return

		if self.s_round > 0:
			next_link = 'http://v.qq.com/zt2012/italy/italy%02d.htm' % self.s_round
			if not RedisUtil.get(next_link):
				# Record the URL in Redis so it is not crawled again. This is
				# done before yielding so the mark is written even if the
				# consumer stops iterating right after taking the request.
				RedisUtil.set(next_link, next_link)
				yield Request(url=next_link, callback=self.parse)

		sites = hxs.select("/html/body/div[@class='wrapper_zt']/div[@id='ztg_5']/div[@class='col_1']/div[@id='ztu_7']/div[@class='bd']/div[@class='inner']/div[@id='ztc_2']/div[@id='videoTV']/div[@class='right']/div[@id='videoListBox']/div[@id='videoList']/ul/li")
		for site in sites:
			item = SerieaItem()
			item['sport_id'] = 4  # Serie A
			item['s_round'] = self.s_round + 1
			item['title'] = list_first_item(site.select("h2/text()").extract())

			# The video URL is the third argument of the share button's
			# postToWb(...) onclick handler.
			onclick = list_first_item(site.select("div/div[1]/dl/dd/span[@class='rightS']/a[@class='iconWb']/@onclick").extract())
			item['url'] = None
			if onclick:
				call_args = onclick.strip()
				# str.lstrip() takes a *set* of characters, not a prefix, so
				# the call name is removed explicitly instead.
				prefix = 'postToWb('
				if call_args.startswith(prefix):
					call_args = call_args[len(prefix):]
				call_args = call_args.rstrip(');')
				parts = call_args.split(',')
				if len(parts) > 2:
					# Tolerate whitespace after the comma before unquoting.
					item['url'] = parts[2].strip().strip("'")

			item['image'] = list_first_item(site.select("div/div[1]/dl/dt/img/@src").extract())

			# Drop a trailing double quote when present; a missing value is
			# passed through untouched instead of raising AttributeError.
			duration = list_first_item(site.select("div/div[1]/dl/dd/span[@class='time']/text()").extract())
			item['time'] = duration.rstrip('"') if duration else duration

			yield item

		self.s_round -= 1
		if self.s_round == 0:
			self.is_over = True
Beispiel #2
0
	def parse(self, response):
		"""Parse one round page: queue the previous round page and yield its videos.

		Yields at most one ``Request`` for the preceding round page (skipped when
		``RedisUtil`` already holds that URL), followed by one ``CslItem`` per
		``<li>`` of the video list. ``self.s_round`` is decremented on every call
		and ``self.is_over`` is set once it reaches 0.

		NOTE(review): progress is kept on the spider instance, so the round
		stamped on each item presumably relies on responses being handled in
		the order they were requested -- confirm against the crawl settings.
		"""
		hxs = HtmlXPathSelector(response)

		# Initialise the most recent match round.
		if self.s_round == 0 and not self.is_over:
			pre_last_url = "/html/body/div[@class='wrap']/div[@class='part04']/div[@class='p04_c clearfix']/ul/li[1]/a[@class='p_a alphaImg']/@href"
			pre_last = list_last_item(hxs.select(pre_last_url).extract())
			pre_last = urljoin(self.video_domain, pre_last)
			# Raw string: "\w" is a regex class here, not a string escape.
			self.s_round = get_round(r'http://sports.sina.com.cn/video/c/j/csl/2013_(\w+)/index.shtml', pre_last)

		if self.s_round < 0:
			return

		if self.s_round > 0:
			# Round 1 is zero-padded ("2013_01"); every other round is not.
			# NOTE(review): presumably mirrors the site's URL scheme -- confirm.
			if self.s_round == 1:
				next_link = 'http://sports.sina.com.cn/video/c/j/csl/2013_%02d/index.shtml' % self.s_round
			else:
				next_link = 'http://sports.sina.com.cn/video/c/j/csl/2013_%d/index.shtml' % self.s_round

			# Resolve before the Redis lookup so the key that is checked is
			# the same key that gets stored.
			next_link = urljoin(self.video_domain, next_link)
			if not RedisUtil.get(next_link):
				# Record the URL in Redis so it is not crawled again. This is
				# done before yielding so the mark is written even if the
				# consumer stops iterating right after taking the request.
				RedisUtil.set(next_link, next_link)
				yield Request(url=next_link, callback=self.parse)

		sites = hxs.select("/html/body/div[@class='wrap']/div[@class='part01 clearfix']/div[@class='p01_focus']/div[@id='p01_cont01']/div[@class='p01_video_li']/div[@id='p01_video_cont']/ul[@id='p01_video_cont00']/li")
		for site in sites:
			item = CslItem()
			item['sport_id'] = 5  # Chinese Super League
			item['s_round'] = self.s_round + 1
			item['title'] = list_first_item(site.select("h2/span/a/text()").extract())
			item['url'] = list_first_item(site.select("h2/a[@class='a_more']/@href").extract())
			item['image'] = list_first_item(site.select("div/blockquote[1]/a[@class='v_a btn_video']/img/@src").extract())

			# Drop a trailing double quote when present; a missing value is
			# passed through untouched instead of raising AttributeError.
			duration = list_first_item(site.select("div/blockquote[1]/a[@class='v_a btn_video']/s/text()").extract())
			item['time'] = duration.rstrip('"') if duration else duration

			yield item

		self.s_round -= 1
		if self.s_round == 0:
			self.is_over = True