def parse(self, response):
    """Turn one movie-details JSON response into a ``MovieItem`` and queue follow-ups.

    The response body is decoded as JSON and the movie record is read from
    ``data.movieDetails``.  Afterwards one request is yielded for every movie
    id from ``self.movie_id + 1`` up to 140215, each built by appending the id
    to ``self.start_urls[0]`` and routed back to this callback.
    ``self.movie_id`` is advanced as the requests are produced, so ids already
    handed out by one call are not requested again by another.

    A body that is not valid JSON, or that lacks an expected field, is logged
    and skipped instead of raised: an exception raised before the ``while``
    loop would prevent the follow-up requests from ever being yielded.

    Yields:
        The populated ``MovieItem`` (when the payload is usable), followed by
        one ``scrapy.Request`` per remaining movie id.
    """
    # Function-scope import: the file's import block is not visible from here.
    import logging

    try:
        entity = json.loads(response.body)
        details = entity['data']['movieDetails']
        movie = details['movie']
        fields = {
            'movieid': details['movieId'],
            'moviename': movie['title'],
            'directors': ",".join(movie['directors']),
            'actors': ",".join(movie['actors']),
            # posterPath is appended to the fixed image.tmdb.org w185 prefix.
            'posterPath': "http://image.tmdb.org/t/p/w185" + movie['posterPath'],
            'plotSummary': movie['plotSummary'],
            'averageratings': movie['avgRating'],
            'numRatings': movie['numRatings'],
        }
    except (KeyError, TypeError, ValueError) as exc:
        # ValueError covers invalid JSON; KeyError/TypeError cover missing or
        # null fields (e.g. a null posterPath cannot be concatenated).
        logging.getLogger(__name__).warning(
            "Skipping %s: unusable movie payload (%r)", response.url, exc)
    else:
        item = MovieItem()
        for key, value in fields.items():
            item[key] = value
        yield item

    # Always schedule the remaining ids, even when this payload was unusable.
    while self.movie_id < 140215:
        self.movie_id += 1
        url = self.start_urls[0] + str(self.movie_id)
        yield scrapy.Request(url, dont_filter=True, callback=self.parse)
def parse(self, response):
    """Yield one ``MovieItem`` (title + absolute link) per ``//ul/li/h5`` heading.

    Each heading's first ``a`` text becomes ``title`` and its first ``a/@href``
    is appended to ``'http://www.meijutt.com/'`` to form ``link``.

    A fresh item is created for every heading.  Re-yielding one shared item
    would hand the same object to every consumer, so later iterations would
    overwrite data that earlier consumers still hold.  Headings with no link
    text or no ``href`` are skipped instead of raising ``IndexError`` and
    aborting the rest of the page.

    Yields:
        ``MovieItem`` instances with ``title`` and ``link`` set.
    """
    sel = scrapy.selector.Selector(response)
    for site in sel.xpath('//ul/li/h5'):
        titles = site.xpath('a/text()').extract()
        hrefs = site.xpath('a/@href').extract()
        if not titles or not hrefs:
            # Heading without an anchor: nothing to record for it.
            continue

        item = MovieItem()
        item['title'] = titles[0]
        item['link'] = 'http://www.meijutt.com/' + hrefs[0]
        yield item
def parse(self, response):
    """Yield a ``MovieItem`` (name, link, score) for each listing table that has a link.

    Tables matched under ``#content`` that contain no ``a`` element are
    ignored.  For the others, the first anchor's ``title`` and ``href`` and the
    first ``rating_nums`` span text are stripped and stored, echoed to stdout,
    and the item is yielded.
    """
    tables = Selector(response).xpath(
        '//*[@id="content"]/div/div[1]/div/div/table')
    for table in tables:
        if not table.xpath(".//a"):
            continue

        name = table.xpath(".//a/@title").extract()[0].strip()
        link = table.xpath(".//a/@href").extract()[0].strip()
        score = table.xpath(
            ".//span[re:test(@class,'rating_nums')]/text()"
        ).extract()[0].strip()

        item = MovieItem()
        item['name'] = name
        item['link'] = link
        item['score'] = score
        print("%s %s %s" % (item['name'], item['link'], item['score']))
        yield item
def parse_indetail(self, response):
    """Build a ``MovieItem`` with title, crew and popularity from a detail page.

    ``title`` and ``crew`` are lists of text nodes with the last node dropped.
    ``popularity`` is a fixed character slice (``[21:-8]``) of the third text
    node found in the review bar.

    NOTE(review): the slice offsets presumably strip surrounding whitespace or
    markup specific to the page layout at the time of writing -- confirm
    against a live page.

    Returns:
        The populated ``MovieItem``.
    """
    title_nodes = response.xpath(
        '//div[@class="title_wrapper"]/h1/text()').extract()
    crew_nodes = response.xpath(
        '//div[@class ="credit_summary_item"]/a/text()').extract()
    review_bar_nodes = response.xpath(
        '//div[@class="titleReviewBarSubItem"]/div/span/text()').extract()

    item = MovieItem()
    item['title'] = title_nodes[:-1]
    item['crew'] = crew_nodes[:-1]
    # Separate writers/stars extraction is currently disabled; the XPaths
    # previously used were:
    #   //div[@class="credit_summary_item"]/span[@itemprop="creator"]/a/span/text()
    #   //div[@class="credit_summary_item"]/span[@itemprop="actors"]/a/span/text()
    item['popularity'] = review_bar_nodes[2][21:-8]
    return item
def parse(self, response):
    """Return a list with one ``MovieItem`` per ``ul.navUl`` list entry.

    Each item's ``title`` is the list of text nodes of the entry's direct
    ``a`` children (possibly empty).
    """
    def to_item(entry):
        # One item per navigation entry; the title stays a list of strings.
        record = MovieItem()
        record['title'] = entry.xpath('a/text()').extract()
        return record

    nav_entries = scrapy.selector.Selector(response).xpath(
        '//ul[@class="navUl"]/li')
    return [to_item(entry) for entry in nav_entries]
def parse_indetail(self, response):
    """Fill a ``MovieItem`` from fixed XPaths under ``#title-overview-widget``.

    Every field is stored as the list returned by ``extract()`` for its XPath,
    so a field whose XPath matches nothing is an empty list.

    Returns:
        The populated ``MovieItem`` with ``title``, ``directors``, ``writers``,
        ``stars`` and ``popularity`` set.
    """
    # (item field, XPath) pairs, evaluated in this order.
    field_xpaths = (
        ('title',
         '//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[2]/div[2]/h1/text()'),
        ('directors',
         '//*[@id="title-overview-widget"]/div[2]/div[1]/div[2]/a/text()'),
        ('writers',
         '//*[@id="title-overview-widget"]/div[2]/div[1]/div[3]/a[1]/text()'),
        ('stars',
         '//*[@id="title-overview-widget"]/div[2]/div[1]/div[4]/a[1]/text()'),
        ('popularity',
         '//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[1]/div[1]/a/span/text()'),
    )

    item = MovieItem()
    for field, path in field_xpaths:
        item[field] = response.xpath(path).extract()
    return item
def parse(self, response):
    """Collect a movie page's cast list into a single ``MovieItem``.

    The returned item carries:
      * ``movie`` -- a dict with the response URL and the page ``<title>``
        text (as the list returned by ``extract()``);
      * ``cast``  -- one ``TutorialItem`` per ``td.nm`` cell, holding the
        anchor text and ``href`` lists;
      * ``rank``  -- the current ``self.rank``, which is then incremented so
        the next parsed page gets the following rank.

    Each cast entry is also echoed to stdout.

    Returns:
        The populated ``MovieItem``.
    """
    # NOTE(review): HtmlXPathSelector/.select() is Scrapy's legacy selector
    # API; kept as-is because the installed Scrapy version is not visible
    # here -- confirm before migrating to response.xpath().
    hxs = HtmlXPathSelector(response)
    actors = hxs.select('//td[@class="nm"]')

    items = MovieItem()
    items["movie"] = {
        "url": response.url,
        "name": hxs.select("/html/head/title/text()").extract()
    }
    items["cast"] = []
    for actor in actors:
        item = TutorialItem()
        item["name"] = actor.select('a/text()').extract()
        item["link"] = actor.select('a/@href').extract()
        # Call form is valid on Python 2 and 3 and prints the same text as
        # the former `print a, b` statement (a syntax error on Python 3).
        print("%s %s" % (item["name"], item["link"]))
        items["cast"].append(item)

    items["rank"] = self.rank
    self.rank = self.rank + 1
    return items
def parse(self, response):
    """Turn one movie-details JSON response into a ``MovieItem`` and queue follow-ups.

    The response body is decoded as JSON and the movie record is read from
    ``data.movieDetails``.  Afterwards one request is yielded for every movie
    id from ``self.movie_id + 1`` up to 140215, each built by appending the id
    to ``self.start_urls[0]``, sent with ``self.headers`` and ``self.cookie``,
    and routed back to this callback.  ``self.movie_id`` is advanced as the
    requests are produced, so ids already handed out by one call are not
    requested again by another.

    A body that is not valid JSON, or that lacks an expected field, is logged
    and skipped instead of raised: an exception raised before the ``while``
    loop would prevent the follow-up requests from ever being yielded.

    Yields:
        The populated ``MovieItem`` (when the payload is usable), followed by
        one ``scrapy.Request`` per remaining movie id.
    """
    # Function-scope import: the file's import block is not visible from here.
    import logging

    try:
        entity = json.loads(response.body)
        details = entity['data']['movieDetails']
        movie = details['movie']
        fields = {
            'movieid': details['movieId'],
            'moviename': movie['title'],
            'directors': ','.join(movie['directors']),
            'actors': ",".join(movie['actors']),
            'posterPath': movie['posterPath'],
            'plotSummary': movie['plotSummary'],
            'averageratings': movie['avgRating'],
            'numRatings': movie['numRatings'],
        }
    except (KeyError, TypeError, ValueError) as exc:
        # ValueError covers invalid JSON; KeyError/TypeError cover missing or
        # null fields (e.g. a null directors list cannot be joined).
        logging.getLogger(__name__).warning(
            "Skipping %s: unusable movie payload (%r)", response.url, exc)
    else:
        item = MovieItem()
        for key, value in fields.items():
            item[key] = value
        yield item

    # Always schedule the remaining ids, even when this payload was unusable.
    while self.movie_id < 140215:
        self.movie_id += 1
        url = self.start_urls[0] + str(self.movie_id)
        yield scrapy.Request(url,
                             dont_filter=True,
                             callback=self.parse,
                             headers=self.headers,
                             cookies=self.cookie)
def parse(self, response):
    """Yield a detail-page request for each of the first ``wanted_num`` chart rows.

    For every row of the ``chart full-width`` table a ``MovieItem`` is filled
    with title, rating, ranking, release date and the movie's main-page URL.
    The item travels to ``self.parseMovieDetails`` in the request's ``meta``
    under the key ``'item'``.  Iteration stops at the first row whose ranking
    exceeds ``self.wanted_num``.
    """
    def first(node, path):
        # First extracted match for `path`, relative to `node`.
        return node.xpath(path).extract()[0]

    # Reset on every call.  Kept small so test runs do not have to fetch the
    # whole chart, which could take a very long time.
    self.wanted_num = 10

    # TODO(king): IMDb seems to have changed the HTML structure for this data.
    rows = response.xpath(
        "//*[contains(@class,'chart full-width')]/tbody/tr")
    for row in rows:
        item = MovieItem()
        item['Title'] = first(row, 'td[2]/a/text()')
        item['Rating'] = first(row, 'td[3]/strong/text()')

        # The rank is the leading run of digits in the cell's first text
        # node; XPath alone cannot isolate it, hence the regex.
        rank_text = str(first(row, 'td[2]/text()')).strip()
        item['Ranking'] = re.match(r'(^[0-9]+)', rank_text).group(1)

        item['ReleaseDate'] = first(row, 'td[2]/span/text()')
        item['MainPageUrl'] = "http://imdb.com" + first(row, 'td[2]/a/@href')

        request = scrapy.Request(item['MainPageUrl'],
                                 callback=self.parseMovieDetails,
                                 meta={'item': item})
        if int(item['Ranking']) > self.wanted_num:
            return
        yield request