def scrape_title(self):
    """Extract the movie name from the main page's <title> tag.

    Expects the title text to look like "Movie Name (2012) - IMDb".
    Sets ``self.movie_name``; the release year is parsed but only kept
    locally (matching the original behavior).

    Raises:
        AssertionError: if the page does not have exactly one <title> tag.
        ValueError: if the <title> text does not match the expected pattern.
    """
    _title = self.main_page_soup.findAll('title')
    assert len(_title) == 1
    _title = clean_unicode(_title[0].next)
    # Raw string: the original '(.+) \((\d+)\) - IMDb' relied on invalid
    # escape sequences ('\(' and '\d') passing through unchanged.
    f = re.match(r'(.+) \((\d+)\) - IMDb', _title)
    if f is None:
        # Fail loudly with context instead of AttributeError on None.groups().
        raise ValueError('unexpected <title> format: %r' % (_title,))
    self.movie_name, release_year = f.groups()
def scrape_title(self):
    """Parse the movie name (and release year) out of the page's single
    <title> element, which reads like "Movie Name (2012) - IMDb".

    Stores the name on ``self.movie_name``; the year stays local.
    """
    title_tags = self.main_page_soup.findAll('title')
    assert len(title_tags) == 1
    title_text = clean_unicode(title_tags[0].next)
    match = re.match('(.+) \((\d+)\) - IMDb', title_text)
    matched = match.groups()
    assert len(matched) == 2
    self.movie_name, release_year = matched
def get_top_box_office_by_year(year, number, debug=False): """ Pull out the 'number' highest-grosing movies of the year. """ NUM_MOVIES_PER_PAGE = 50 sort = 'boxoffice_gross_us' def get_website(start, year): website = 'http://www.imdb.com/search/title?at=0&sort=%s&start=%s&title_type=feature&year=%s,%s' % ( sort, start, year, year) return website n = 1 ret_list = OrderedDict() while n < number: print 'n=%s/%s' % (n, number) url_page = get_website(start=n, year=year) print url_page n += NUM_MOVIES_PER_PAGE # I don't get why, but IMDB barfs when I specify a user agent??? soup = get_soup(url_page, no_user_agent=True) # Match on <td class="number">, which refers to the ranking of the movie all_movies = soup.findAll('td', **{'class': "number"}) for movie in all_movies: title_part = movie.next.next.next.next.next.next.next.next.next.next.next.next.next movie_name = clean_unicode(title_part.next) link = str(title_part['href']) m = re.match('/title/tt(\d+)/', link) groups = m.groups() assert len(groups) == 1 imdb_movie_id = int(groups[0]) _year = title_part.next.next.next.next m = re.match(r'\((\d+)\)', _year) groups = m.groups() assert len(groups) == 1 year = int(groups[0]) ret_list[imdb_movie_id] = movie_name # if only a few movies are requested if len(ret_list) == number: return ret_list return ret_list
def get_top_box_office_by_year(year, number, debug=False):
    """ Pull out the 'number' highest-grosing movies of the year. """
    # Returns an OrderedDict of imdb_movie_id -> movie name, in the order
    # IMDB lists them when sorted by US box-office gross, 50 per page.
    NUM_MOVIES_PER_PAGE=50
    sort='boxoffice_gross_us'
    def get_website(start,year):
        # Build the advanced-search URL for one 50-result page.
        website='http://www.imdb.com/search/title?at=0&sort=%s&start=%s&title_type=feature&year=%s,%s' % (sort,start,year,year)
        return website
    n=1
    ret_list=OrderedDict()
    while n<number:
        print 'n=%s/%s' % (n,number)
        url_page = get_website(start=n,year=year)
        print url_page
        n+=NUM_MOVIES_PER_PAGE
        # I don't get why, but IMDB barfs when I specify a user agent???
        soup=get_soup(url_page,no_user_agent=True)
        # Match on <td class="number">, which refers to the ranking of the movie
        all_movies=soup.findAll('td',**{'class':"number"})
        for movie in all_movies:
            # Walk the node chain from the rank cell to the title anchor.
            title_part=movie.next.next.next.next.next.next.next.next.next.next.next.next.next
            movie_name=clean_unicode(title_part.next)
            # The movie id is embedded in the link: /title/tt<digits>/
            link=str(title_part['href'])
            m=re.match('/title/tt(\d+)/',link)
            groups=m.groups()
            assert len(groups)==1
            imdb_movie_id=int(groups[0])
            # The "(YYYY)" year text follows the title anchor.
            _year=title_part.next.next.next.next
            m=re.match(r'\((\d+)\)',_year)
            groups=m.groups()
            assert len(groups)==1
            # NOTE(review): this rebinds the 'year' *parameter* to the scraped
            # movie year, which then feeds later get_website() calls — confirm
            # that is intentional.
            year=int(groups[0])
            ret_list[imdb_movie_id]=movie_name
            # if only a few movies are requested
            if len(ret_list) == number:
                return ret_list
    return ret_list
def get_review_from_page(self,review_soup,imdb_review_url): """ Pull out a single review form an IMDB movie review page. review is a soup object anchored on a reviewer's avatar. """ # Most reviews begin with the text # > "XXX out of XXX found the following review useful:" # we have to back up to find it, but sometimes it doesn't exist _quality_of_review = review_soup.previous.previous.previous.previous m=re.match('(\d+) out of (\d+) people found the following review useful:', str(_quality_of_review)) if m is not None and len(m.groups())==2: groups=m.groups() num_likes = int(groups[0]) num_dislikes = int(groups[1])-int(groups[0]) else: num_likes = num_dislikes = None _title=review_soup.next.next.next review_title=clean_unicode(_title) # the next thing to look for is the review score. # Note that this doesn't not always exist: review_image = _title.next.next if review_image.name == 'img': _review_score=_title.next.next.attrs['alt'] review_score=_review_score.split('/') assert review_score[0].isdigit() and review_score[1].isdigit() review_score=[int(review_score[0]),int(review_score[1])] assert review_score[0] in range(1,11) assert review_score[1]==10 review_score=review_score[0] _reviewer=_title.next.next.next.contents[3].next else: # No user review, jump to reviewer review_score=None _reviewer=_title.next.next.next.next.next.next reviewer_url=_reviewer.previous['href'] m=re.match('/user/ur(\d+)/',reviewer_url) groups=m.groups() assert len(groups)==1 imdb_reviewer_id = int(groups[0]) if _reviewer == ' ': # for some reason, I think some reviewers don't have # a reviewer name. I found this problem here: # http://www.imdb.com/title/tt1408101/reviews?start=120 reviewer=None _review_place=_reviewer.next.next elif hasattr(_reviewer,'name') and _reviewer.name == 'br': # this happens when there is no reviewer and no place! 
# This happend at: http://www.imdb.com/title/tt1392170/reviews?start=1340 # If so, move the '_place' up the "<small>8 April 2012</small>" # html so that it will get caught at the next condition reviewer=None _review_place=_reviewer.next.next else: reviewer=clean_unicode(_reviewer) _review_place=_reviewer.next.next.next if hasattr(_review_place,'name') and _review_place.name == 'small': # this happens when there is no place. # If so, skip on to date # For an example of this ... review_place = None _date = _review_place.next else: m = re.match('from (.+)', _review_place) groups=m.groups() assert len(groups)==1 review_place = groups[0] review_place=review_place _date=_review_place.next.contents[1].next date=str(_date) date=datetime.datetime.strptime(date,'%d %B %Y') _review_text=_date.next.next.next.next imdb_review_text=get_html_text(_review_text) if imdb_review_text=='*** This review may contain spoilers ***.': spoilers=True _review_text=_review_text.next.next.next.next imdb_review_text=get_html_text(_review_text) else: spoilers=False d=dict(review_title=review_title, date=date, review_score=review_score, reviewer=reviewer, review_place=review_place, imdb_review_text=imdb_review_text, spoilers=spoilers, num_likes = num_likes, num_dislikes = num_dislikes, imdb_movie_id=self.imdb_movie_id, imdb_reviewer_id=imdb_reviewer_id, imdb_review_ranking=self.imdb_review_ranking_counter, imdb_review_url=imdb_review_url) self.imdb_review_ranking_counter+=1 return d
def get_review_from_page(self, review_soup, imdb_review_url):
    """Scrape one review from an IMDB movie review page.

    ``review_soup`` is a soup node anchored on the reviewer's avatar;
    ``imdb_review_url`` is the page the review came from.  Returns a
    dict of the extracted fields and bumps
    ``self.imdb_review_ranking_counter``.
    """
    # A review usually opens with the (optional) helpfulness banner
    # "X out of Y people found the following review useful:", which sits
    # a few nodes *before* the avatar anchor.
    helpful_node = review_soup.previous.previous.previous.previous
    helped = re.match(
        '(\d+) out of (\d+) people found the following review useful:',
        str(helpful_node))
    if helped is not None and len(helped.groups()) == 2:
        liked, total = helped.groups()
        num_likes = int(liked)
        num_dislikes = int(total) - int(liked)
    else:
        num_likes = num_dislikes = None

    title_node = review_soup.next.next.next
    review_title = clean_unicode(title_node)

    # A star-rating image may follow the title; its alt text is "N/10".
    score_img = title_node.next.next
    if score_img.name == 'img':
        score_parts = title_node.next.next.attrs['alt'].split('/')
        assert score_parts[0].isdigit() and score_parts[1].isdigit()
        numerator = int(score_parts[0])
        denominator = int(score_parts[1])
        assert numerator in range(1, 11)
        assert denominator == 10
        review_score = numerator
        reviewer_node = title_node.next.next.next.contents[3].next
    else:
        # No rating image: hop straight across to the reviewer name.
        review_score = None
        reviewer_node = title_node.next.next.next.next.next.next

    # The reviewer id lives in the profile link: /user/ur<digits>/
    profile_href = reviewer_node.previous['href']
    id_match = re.match('/user/ur(\d+)/', profile_href)
    id_groups = id_match.groups()
    assert len(id_groups) == 1
    imdb_reviewer_id = int(id_groups[0])

    if reviewer_node == ' ':
        # Some reviewers have no display name, e.g.
        # http://www.imdb.com/title/tt1408101/reviews?start=120
        reviewer = None
        place_node = reviewer_node.next.next
    elif hasattr(reviewer_node, 'name') and reviewer_node.name == 'br':
        # Neither reviewer nor place, seen at
        # http://www.imdb.com/title/tt1392170/reviews?start=1340
        # Step over so the "<small>8 April 2012</small>" date element is
        # picked up by the condition below.
        reviewer = None
        place_node = reviewer_node.next.next
    else:
        reviewer = clean_unicode(reviewer_node)
        place_node = reviewer_node.next.next.next

    if hasattr(place_node, 'name') and place_node.name == 'small':
        # No "from <place>" line — go straight to the date.
        review_place = None
        date_node = place_node.next
    else:
        place_match = re.match('from (.+)', place_node)
        place_groups = place_match.groups()
        assert len(place_groups) == 1
        review_place = place_groups[0]
        date_node = place_node.next.contents[1].next

    # Dates are rendered like "8 April 2012".
    date = datetime.datetime.strptime(str(date_node), '%d %B %Y')

    text_node = date_node.next.next.next.next
    imdb_review_text = get_html_text(text_node)
    if imdb_review_text == '*** This review may contain spoilers ***.':
        # Skip the spoiler warning and re-extract the actual body.
        spoilers = True
        text_node = text_node.next.next.next.next
        imdb_review_text = get_html_text(text_node)
    else:
        spoilers = False

    result = dict(review_title=review_title,
                  date=date,
                  review_score=review_score,
                  reviewer=reviewer,
                  review_place=review_place,
                  imdb_review_text=imdb_review_text,
                  spoilers=spoilers,
                  num_likes=num_likes,
                  num_dislikes=num_dislikes,
                  imdb_movie_id=self.imdb_movie_id,
                  imdb_reviewer_id=imdb_reviewer_id,
                  imdb_review_ranking=self.imdb_review_ranking_counter,
                  imdb_review_url=imdb_review_url)
    self.imdb_review_ranking_counter += 1
    return result