def parse(self, response):
    """Scrape one Serie A video page and queue the page for the round below it.

    The spider walks the rounds backwards using ``self.s_round`` as a
    countdown:

    * While ``self.s_round`` is 0 and ``self.is_over`` is false, the counter
      is seeded by passing the last matching link on the page to
      ``get_round``.
    * If the counter is positive, a request for ``italy<NN>.htm`` (counter
      zero-padded to two digits) is yielded unless ``RedisUtil`` already
      holds that URL; the URL is then stored in redis.
    * Every ``<li>`` in the video list becomes a ``SerieaItem`` tagged with
      ``self.s_round + 1``.
    * Finally the counter is decremented, and ``self.is_over`` is set once
      it reaches 0.

    Nothing is produced when ``self.s_round`` is negative.

    Args:
        response: The downloaded page handed over by the crawler.

    Yields:
        ``Request`` objects for the next page to crawl and ``SerieaItem``
        objects for each video entry found on this page.
    """
    hxs = HtmlXPathSelector(response)

    # Initialise the most recent round number. This only runs while the
    # counter is still 0 and the crawl has not been flagged as finished.
    if self.s_round == 0 and not self.is_over:
        pre_last_url = "/html/body/div[@class='wrapper_zt']/div[@id='ztg_7']/div[@class='col_1']/div[@id='ztu_8']/div[@class='bd']/div[@class='inner']/div/table[@class='video_ct']/tbody/tr/td/div[@class='video_area']/table/tbody/tr[1]/td/div[@class='video_pic']/a/@href"
        pre_last = list_last_item(hxs.select(pre_last_url).extract())
        # Raw string: "\w" is a regex class, not a string escape.
        self.s_round = get_round(r'http://v.qq.com/zt2012/italy/italy(\w+).htm', pre_last)

    if self.s_round >= 0:
        if self.s_round > 0:
            next_link = 'http://v.qq.com/zt2012/italy/italy%02d.htm' % self.s_round
            if not RedisUtil.get(next_link):
                yield Request(url=next_link, callback=self.parse)
                # Remember crawled URLs in redis so they are not crawled twice.
                RedisUtil.set(next_link, next_link)

        sites = hxs.select("/html/body/div[@class='wrapper_zt']/div[@id='ztg_5']/div[@class='col_1']/div[@id='ztu_7']/div[@class='bd']/div[@class='inner']/div[@id='ztc_2']/div[@id='videoTV']/div[@class='right']/div[@id='videoListBox']/div[@id='videoList']/ul/li")
        for site in sites:
            item = SerieaItem()
            item['sport_id'] = 4  # Serie A
            # The counter was already moved to the page queued above, so the
            # entries on this page are tagged one higher.
            item['s_round'] = self.s_round + 1
            item['title'] = list_first_item(site.select("h2/text()").extract())

            # The link is taken from the third comma-separated argument of the
            # "postToWb(...)" onclick handler.
            # lstrip()/rstrip() strip character sets, not literal prefixes or
            # suffixes; since only the third field is kept, the leading strip
            # cannot change the result.
            # NOTE(review): raises if the onclick attribute is missing or has
            # fewer than three fields -- confirm whether that is acceptable.
            url = list_first_item(site.select("div/div[1]/dl/dd/span[@class='rightS']/a[@class='iconWb']/@onclick").extract())
            item['url'] = url.lstrip('postToWb(').rstrip(');').split(',')[2].strip("'")

            item['image'] = list_first_item(site.select("div/div[1]/dl/dt/img/@src").extract())
            # Drop trailing double-quote characters from the time text.
            # NOTE(review): assumes list_first_item returned a string here.
            item['time'] = list_first_item(site.select("div/div[1]/dl/dd/span[@class='time']/text()").extract()).rstrip('"')
            yield item

        # Step the countdown; reaching 0 marks the crawl as finished so the
        # seeding branch above is not taken again.
        self.s_round -= 1
        if self.s_round == 0:
            self.is_over = True
def parse(self, response):
    """Scrape one CSL video page and queue the page for the round below it.

    The spider walks the rounds backwards using ``self.s_round`` as a
    countdown:

    * While ``self.s_round`` is 0 and ``self.is_over`` is false, the counter
      is seeded from the last matching link on the page, which is joined
      with ``self.video_domain`` and passed to ``get_round``.
    * If the counter is positive, a request for ``2013_<round>/index.shtml``
      is yielded unless ``RedisUtil`` already holds that URL; the URL is
      then stored in redis.
    * Every ``<li>`` in the video list becomes a ``CslItem`` tagged with
      ``self.s_round + 1``.
    * Finally the counter is decremented, and ``self.is_over`` is set once
      it reaches 0.

    Nothing is produced when ``self.s_round`` is negative.

    Args:
        response: The downloaded page handed over by the crawler.

    Yields:
        ``Request`` objects for the next page to crawl and ``CslItem``
        objects for each video entry found on this page.
    """
    hxs = HtmlXPathSelector(response)

    # Initialise the most recent round number. This only runs while the
    # counter is still 0 and the crawl has not been flagged as finished.
    if self.s_round == 0 and not self.is_over:
        pre_last_url = "/html/body/div[@class='wrap']/div[@class='part04']/div[@class='p04_c clearfix']/ul/li[1]/a[@class='p_a alphaImg']/@href"
        pre_last = list_last_item(hxs.select(pre_last_url).extract())
        pre_last = urljoin(self.video_domain, pre_last)
        # Raw string: "\w" is a regex class, not a string escape.
        self.s_round = get_round(r'http://sports.sina.com.cn/video/c/j/csl/2013_(\w+)/index.shtml', pre_last)

    if self.s_round >= 0:
        if self.s_round > 0:
            # Round 1 is zero-padded ("2013_01"); every other round is not.
            # NOTE(review): presumably mirrors the site's URL scheme -- confirm.
            if self.s_round == 1:
                next_link = 'http://sports.sina.com.cn/video/c/j/csl/2013_%02d/index.shtml' % self.s_round
            else:
                next_link = 'http://sports.sina.com.cn/video/c/j/csl/2013_%d/index.shtml' % self.s_round
            if not RedisUtil.get(next_link):
                # next_link is already absolute, so urljoin returns it as is.
                next_link = urljoin(self.video_domain, next_link)
                yield Request(url=next_link, callback=self.parse)
                # Remember crawled URLs in redis so they are not crawled twice.
                RedisUtil.set(next_link, next_link)

        sites = hxs.select("/html/body/div[@class='wrap']/div[@class='part01 clearfix']/div[@class='p01_focus']/div[@id='p01_cont01']/div[@class='p01_video_li']/div[@id='p01_video_cont']/ul[@id='p01_video_cont00']/li")
        for site in sites:
            item = CslItem()
            item['sport_id'] = 5  # Chinese Super League
            # The counter was already moved to the page queued above, so the
            # entries on this page are tagged one higher.
            item['s_round'] = self.s_round + 1
            item['title'] = list_first_item(site.select("h2/span/a/text()").extract())
            item['url'] = list_first_item(site.select("h2/a[@class='a_more']/@href").extract())
            item['image'] = list_first_item(site.select("div/blockquote[1]/a[@class='v_a btn_video']/img/@src").extract())
            # Drop trailing double-quote characters from the time text.
            # NOTE(review): assumes list_first_item returned a string here.
            item['time'] = list_first_item(site.select("div/blockquote[1]/a[@class='v_a btn_video']/s/text()").extract()).rstrip('"')
            yield item

        # Step the countdown; reaching 0 marks the crawl as finished so the
        # seeding branch above is not taken again.
        self.s_round -= 1
        if self.s_round == 0:
            self.is_over = True