def get_top_box_office_by_year(year, number, debug=False):
    """Return an OrderedDict mapping imdb_movie_id -> title for the
    `number` highest-grossing movies of `year`.

    Parameters
    ----------
    year : int
        Release year to query on IMDB.
    number : int
        How many movies to collect (results are paginated 50 per page).
    debug : bool
        Unused here; kept for interface compatibility with callers.

    Returns
    -------
    OrderedDict
        Keys are integer IMDB movie ids, values are cleaned title strings,
        in box-office ranking order.
    """
    NUM_MOVIES_PER_PAGE = 50
    sort = 'boxoffice_gross_us'

    def get_website(start, year):
        # IMDB advanced title search, paginated via `start` (1-based index).
        website = 'http://www.imdb.com/search/title?at=0&sort=%s&start=%s&title_type=feature&year=%s,%s' % (
            sort, start, year, year)
        return website

    n = 1
    ret_list = OrderedDict()
    while n < number:
        print('n=%s/%s' % (n, number))
        url_page = get_website(start=n, year=year)
        print(url_page)
        n += NUM_MOVIES_PER_PAGE
        # I don't get why, but IMDB barfs when I specify a user agent???
        soup = get_soup(url_page, no_user_agent=True)
        # Match on <td class="number">, which refers to the ranking of the movie
        all_movies = soup.findAll('td', **{'class': "number"})
        for movie in all_movies:
            # Walk from the ranking cell to the title anchor; the fixed
            # .next chain is tied to IMDB's exact page layout.
            title_part = movie.next.next.next.next.next.next.next.next.next.next.next.next.next
            movie_name = clean_unicode(title_part.next)
            link = str(title_part['href'])
            # Raw string: \d is a regex escape, not a string escape.
            m = re.match(r'/title/tt(\d+)/', link)
            groups = m.groups()
            assert len(groups) == 1
            imdb_movie_id = int(groups[0])
            _year_text = title_part.next.next.next.next
            m = re.match(r'\((\d+)\)', _year_text)
            groups = m.groups()
            assert len(groups) == 1
            # BUG FIX: this used to be assigned to `year`, clobbering the
            # function parameter and corrupting the URLs of later pages.
            scraped_year = int(groups[0])  # informational; not used further
            ret_list[imdb_movie_id] = movie_name
            # if only a few movies are requested
            if len(ret_list) == number:
                return ret_list
    return ret_list
def scrape_main_page(self):
    """Fetch this movie's IMDB main page and run every field scraper.

    Sets self.main_page_url (from self.imdb_movie_id) and
    self.main_page_soup, then delegates to the individual scrape_*
    helpers for title, review count, release date, budget, gross,
    description and posters — presumably each stores its result on
    self (TODO confirm against the helper definitions).
    """
    self.main_page_url = self.get_main_page_url(self.imdb_movie_id)
    self.main_page_soup = get_soup(self.main_page_url)
    # NOTE(review): the helpers below may read state written by earlier
    # ones — original call order preserved; confirm before reordering.
    self.scrape_title()
    self.scrape_nreviews()
    self.scrape_release_date()
    self.scrape_budget()
    self.scrape_gross()
    self.scrape_description()
    self.get_posters()
def get_top_box_office_by_year(year, number, debug=False):
    """Return an OrderedDict mapping imdb_movie_id -> title for the
    `number` highest-grossing movies of `year`.

    Parameters
    ----------
    year : int
        Release year to query on IMDB.
    number : int
        How many movies to collect (results are paginated 50 per page).
    debug : bool
        Unused here; kept for interface compatibility with callers.

    Returns
    -------
    OrderedDict
        Keys are integer IMDB movie ids, values are cleaned title strings,
        in box-office ranking order.
    """
    NUM_MOVIES_PER_PAGE = 50
    sort = 'boxoffice_gross_us'

    def get_website(start, year):
        # IMDB advanced title search, paginated via `start` (1-based index).
        website = 'http://www.imdb.com/search/title?at=0&sort=%s&start=%s&title_type=feature&year=%s,%s' % (
            sort, start, year, year)
        return website

    n = 1
    ret_list = OrderedDict()
    while n < number:
        print('n=%s/%s' % (n, number))
        url_page = get_website(start=n, year=year)
        print(url_page)
        n += NUM_MOVIES_PER_PAGE
        # I don't get why, but IMDB barfs when I specify a user agent???
        soup = get_soup(url_page, no_user_agent=True)
        # Match on <td class="number">, which refers to the ranking of the movie
        all_movies = soup.findAll('td', **{'class': "number"})
        for movie in all_movies:
            # Walk from the ranking cell to the title anchor; the fixed
            # .next chain is tied to IMDB's exact page layout.
            title_part = movie.next.next.next.next.next.next.next.next.next.next.next.next.next
            movie_name = clean_unicode(title_part.next)
            link = str(title_part['href'])
            # Raw string: \d is a regex escape, not a string escape.
            m = re.match(r'/title/tt(\d+)/', link)
            groups = m.groups()
            assert len(groups) == 1
            imdb_movie_id = int(groups[0])
            _year_text = title_part.next.next.next.next
            m = re.match(r'\((\d+)\)', _year_text)
            groups = m.groups()
            assert len(groups) == 1
            # BUG FIX: this used to be assigned to `year`, clobbering the
            # function parameter and corrupting the URLs of later pages.
            scraped_year = int(groups[0])  # informational; not used further
            ret_list[imdb_movie_id] = movie_name
            # if only a few movies are requested
            if len(ret_list) == number:
                return ret_list
    return ret_list
def get_reviews_from_page(self, imdb_review_url):
    """Scrape every user review found on a single IMDB review page.

    Parameters
    ----------
    imdb_review_url : str
        URL of the review page to fetch.

    Returns
    -------
    list
        One parsed review per user avatar found on the page; reviews
        that fail to parse are skipped (traceback printed if
        self.debug is set).
    """
    soup = get_soup(imdb_review_url)
    # find all reviews on the page
    # The easiest way is to match on user avatars:
    all_reviews_html = soup.findAll('img', **{'class': "avatar"})
    all_reviews = []
    for avatar in all_reviews_html:
        try:
            all_reviews.append(self.get_review_from_page(avatar, imdb_review_url))
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. Best-effort: skip the one
            # malformed review rather than abort the whole page.
            print('Error Reading in review on page %s' % imdb_review_url)
            if self.debug:
                traceback.print_exc()
    return all_reviews
def get_reviews_from_page(self, imdb_review_url):
    """Scrape every user review found on a single IMDB review page.

    Parameters
    ----------
    imdb_review_url : str
        URL of the review page to fetch.

    Returns
    -------
    list
        One parsed review per user avatar found on the page; reviews
        that fail to parse are skipped (traceback printed if
        self.debug is set).
    """
    soup = get_soup(imdb_review_url)
    # find all reviews on the page
    # The easiest way is to match on user avatars:
    all_reviews_html = soup.findAll('img', **{'class': "avatar"})
    all_reviews = []
    for avatar in all_reviews_html:
        try:
            all_reviews.append(
                self.get_review_from_page(avatar, imdb_review_url))
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. Best-effort: skip the one
            # malformed review rather than abort the whole page.
            print('Error Reading in review on page %s' % imdb_review_url)
            if self.debug:
                traceback.print_exc()
    return all_reviews
def _get_movie_list(url):
    """Scrape a 100-entry IMDB chart page into a DataFrame.

    Parameters
    ----------
    url : str
        URL of the chart page to fetch.

    Returns
    -------
    DataFrame
        Columns 'imdb_movie_id' (int) and 'ranking' (0-based position
        of the movie on the page).
    """
    soup = get_soup(url)
    votes = soup.find(text='Votes')
    # Walk from the 'Votes' column header to the first title anchor;
    # the fixed .next chain is tied to IMDB's exact page layout.
    current_movie = votes.next.next.next.next.next.next.next.next.next.next.next
    movies = [current_movie]
    # The remaining 99 entries are each a fixed number of nodes apart.
    for _ in range(99):
        current_movie = current_movie.next.next.next.next.next.next.next.next.next.next.next.next.next.next.next.next
        movies.append(current_movie)
    # BUG FIX: removed dead `ret = OrderedDict()` (was never used).
    imdb_movie_id = []
    ranking = []
    for i, movie in enumerate(movies):
        # hrefs look like '/title/tt0123456/'; raw string for the \d escape.
        m = re.match(r'/title/tt(\d+)/', movie['href'])
        imdb_movie_id.append(int(m.groups()[0]))
        ranking.append(i)
    return DataFrame({'imdb_movie_id': imdb_movie_id, 'ranking': ranking})
def _get_movie_list(url):
    """Scrape a 100-entry IMDB chart page into a DataFrame.

    Parameters
    ----------
    url : str
        URL of the chart page to fetch.

    Returns
    -------
    DataFrame
        Columns 'imdb_movie_id' (int) and 'ranking' (0-based position
        of the movie on the page).
    """
    soup = get_soup(url)
    votes = soup.find(text='Votes')
    # Walk from the 'Votes' column header to the first title anchor;
    # the fixed .next chain is tied to IMDB's exact page layout.
    current_movie = votes.next.next.next.next.next.next.next.next.next.next.next
    movies = [current_movie]
    # The remaining 99 entries are each a fixed number of nodes apart.
    for _ in range(99):
        current_movie = current_movie.next.next.next.next.next.next.next.next.next.next.next.next.next.next.next.next
        movies.append(current_movie)
    # BUG FIX: removed dead `ret = OrderedDict()` (was never used).
    imdb_movie_id = []
    ranking = []
    for i, movie in enumerate(movies):
        # hrefs look like '/title/tt0123456/'; raw string for the \d escape.
        m = re.match(r'/title/tt(\d+)/', movie['href'])
        imdb_movie_id.append(int(m.groups()[0]))
        ranking.append(i)
    return DataFrame({'imdb_movie_id': imdb_movie_id, 'ranking': ranking})
def nreviews_on_page(imdb_movie_id, debug=False):
    """Return the review count scraped from a movie's main IMDB page.

    Kept as a free-standing function (rather than a bound method) so it
    can be memoized by the caching layer.
    """
    url = IMDBScraper.get_main_page_url(imdb_movie_id)
    return IMDBScraper._scrape_nreviews(get_soup(url), imdb_movie_id, debug)
def nreviews_on_page(imdb_movie_id, debug=False):
    """Return the review count scraped from a movie's main IMDB page.

    Kept as a free-standing function (rather than a bound method) so it
    can be memoized by the caching layer.
    """
    url = IMDBScraper.get_main_page_url(imdb_movie_id)
    return IMDBScraper._scrape_nreviews(get_soup(url), imdb_movie_id, debug)