def parse_episode_data(self, response):
    """Build a fully populated ``imdbItem`` from a single episode page.

    The partially filled item passed through ``response.meta['item']``
    supplies the series rating; every other field is read from the page
    itself via XPath.

    Raises ``IndexError`` when any of the XPath queries matches nothing,
    because the first extracted text node is always indexed directly.
    """
    page = HtmlXPathSelector(response)
    series_item = response.meta['item']

    def first_text(query):
        # First text node matched by `query`, with surrounding whitespace removed.
        return page.xpath(query).extract()[0].strip()

    item = imdbItem()
    item['link'] = response.url.strip()
    # seriesRating arrives as a list of extracted strings; keep only the first.
    item["seriesRating"] = series_item["seriesRating"][0].strip()
    item['showName'] = first_text('//h2[@class="tv_header"]/a/text()')
    item['episode'] = first_text(
        '//h2[@class="tv_header"]//span[@class="nobr"]/text()')
    item['episodeRating'] = first_text('//span[@itemprop="ratingValue"]/text()')
    item['votes'] = serializeToInt(
        first_text('//span[@itemprop="ratingCount"]/text()'))
    item['genre'] = first_text('//span[@itemprop="genre"]/text()')
    item['director'] = first_text(
        '//div[@itemprop="director"]//span[@itemprop="name"]/text()')
    item['airDate'] = process_date(first_text(
        '//div[@id="title-overview-widget"]//h1[@class="header"]//span[@class="nobr"]/text()'))
    # The video link is derived from the two fields scraped above.
    item['videoLink'] = process_link(item['showName'], item['episode'])
    return item
def parse_items(self, response):
    """Read the series rating from a show page and follow its season links.

    Yields one ``Request`` per season href found on the page.  Each request
    is handled by ``self.parse_season_links`` and carries the partially
    filled item in ``meta['item']``.  Nothing is yielded when the page has
    no season links.
    """
    hxs = HtmlXPathSelector(response)
    data = imdbItem()
    # Stored as the raw list of extracted text nodes; consumers index into it.
    data["seriesRating"] = hxs.xpath(
        '//span[@itemprop="ratingValue"]/text()').extract()
    seasonLink = hxs.xpath(
        '//div[@id="titleTVSeries"]/div[1]//span[@class="see-more inline"]/a/@href'
    ).extract()

    # Season links are followed (rather than requesting the show's 'epdate'
    # ratings page and parsing it with parse_episode_ratings) because the
    # season pages give access to more data.
    # Iterating an empty list yields nothing, so no explicit emptiness
    # check is required.
    for season in seasonLink:
        # NOTE(review): if the hrefs are root-relative (start with '/'), this
        # concatenation produces a double slash after the host - confirm.
        link = 'http://www.imdb.com/' + season
        request = Request(link, callback=self.parse_season_links)
        # Every request shares the same item object.
        request.meta['item'] = data
        yield request
def parse_items(self, response):
    """Read the series rating from a show page and follow its season links.

    Yields one ``Request`` per season href found on the page.  Each request
    is handled by ``self.parse_season_links`` and carries the partially
    filled item in ``meta['item']``.  Nothing is yielded when the page has
    no season links.
    """
    # Function-scope import keeps this block self-contained; diagnostics go
    # through logging instead of bare print statements so they can be
    # filtered by level and do not pollute stdout.
    import logging
    log = logging.getLogger(__name__)

    hxs = Selector(response)
    data = imdbItem()
    # Stored as the raw list of extracted text nodes; consumers index into it.
    data["seriesRating"] = hxs.xpath('//span[@itemprop="ratingValue"]/text()').extract()
    seasonLink = hxs.xpath('//*[@id="title-episode-widget"]/div/div[3]/a/@href').extract()
    log.debug("parse_items: seriesRating=%r seasonLink=%r",
              data["seriesRating"], seasonLink)

    # Season links are followed (rather than requesting the show's 'epdate'
    # ratings page and parsing it with parse_episode_ratings) because the
    # season pages give access to more data.
    # Iterating an empty list yields nothing, so no explicit emptiness
    # check is required.
    for season in seasonLink:
        # NOTE(review): if the hrefs are root-relative (start with '/'), this
        # concatenation produces a double slash after the host - confirm.
        link = 'http://www.imdb.com/' + season
        request = Request(link, callback=self.parse_season_links)
        # Every request shares the same item object.
        request.meta['item'] = data
        yield request
def parse_episode_data(self, response):
    """Scrape one episode page into an ``imdbItem``.

    The series rating is taken from the item passed in
    ``response.meta['item']``; the remaining fields come from XPath queries
    against the episode page.

    Raises ``IndexError`` if any query returns no text node, since the
    first result is indexed without a check.
    """
    selector = Selector(response)
    parent_item = response.meta['item']

    def text_at(xpath_expr):
        # Stripped first text node for the given XPath expression.
        matches = selector.xpath(xpath_expr).extract()
        return matches[0].strip()

    episode_item = imdbItem()
    episode_item['link'] = response.url.strip()
    # The parent stores seriesRating as a list of strings; use the first one.
    episode_item["seriesRating"] = parent_item["seriesRating"][0].strip()
    episode_item['showName'] = text_at('//h2[@class="tv_header"]/a/text()')
    episode_item['episode'] = text_at('//h2[@class="tv_header"]//span[@class="nobr"]/text()')
    episode_item['episodeRating'] = text_at('//span[@itemprop="ratingValue"]/text()')

    raw_votes = text_at('//span[@itemprop="ratingCount"]/text()')
    episode_item['votes'] = serializeToInt(raw_votes)

    episode_item['genre'] = text_at('//span[@itemprop="genre"]/text()')
    episode_item['director'] = text_at('//div[@itemprop="director"]//span[@itemprop="name"]/text()')

    raw_air_date = text_at('//div[@id="title-overview-widget"]//h1[@class="header"]//span[@class="nobr"]/text()')
    episode_item['airDate'] = process_date(raw_air_date)

    # Derived from the show name and episode label scraped above.
    episode_item['videoLink'] = process_link(episode_item['showName'], episode_item['episode'])
    return episode_item
def parse_episode_ratings(self, response):
    """Turn a ratings table page into a list of per-episode ``imdbItem`` objects.

    The text of every right-aligned table cell is collected and passed to
    ``grouped(..., 3)``; each resulting group is unpacked as
    (episode, rating, votes).  Title, link and series rating are copied
    from the item supplied in ``response.meta['item']``.

    Returns a list of items, empty when no cells are found.
    """
    selector = HtmlXPathSelector(response)
    cells = selector.xpath('//td[@align="right"]/text()').extract()
    series_item = response.meta['item']

    def build_item(episode, rating, votes):
        # One output item per (episode, rating, votes) group.
        item = imdbItem()
        item["title"] = series_item["title"]
        item["link"] = series_item["link"]
        item["seriesRating"] = series_item["seriesRating"]
        # Drop non-breaking spaces from the episode label.
        item["episode"] = episode.replace(u'\xa0', u'')
        item["episodeRating"] = rating
        item["votes"] = votes
        return item

    return [
        build_item(episode, rating, votes)
        for episode, rating, votes in grouped(cells, 3)
    ]
def parse_episode_ratings(self, response):
    """Collect per-episode ratings from a ratings table page.

    Text from all right-aligned table cells is fed to ``grouped(..., 3)``
    and each group is unpacked as (episode, rating, votes).  The series
    level fields (title, link, seriesRating) are copied from the item in
    ``response.meta['item']``.

    Returns a list of ``imdbItem`` objects, empty when no cells are found.
    """
    page = Selector(response)
    results = []
    raw_cells = page.xpath('//td[@align="right"]/text()').extract()
    parent = response.meta['item']

    for episode, rating, votes in grouped(raw_cells, 3):
        entry = imdbItem()
        # Series-level fields are shared by every episode entry.
        for field in ("title", "link", "seriesRating"):
            entry[field] = parent[field]
        # Strip non-breaking spaces from the episode label.
        entry["episode"] = episode.replace(u'\xa0', u'')
        entry["episodeRating"] = rating
        entry["votes"] = votes
        results.append(entry)

    return results