def getTop100Actors():
    """Return the lower-cased names of the top-grossing actors on Box Office Mojo.

    Walks the first two pages of the "people" chart (view=Actor, sorted by
    total gross, descending) and collects the text of the 2nd column of every
    data row.  According to the original notes, the second page actually
    lists names up to #150 rather than #100; the extra names are kept because
    they make no difference for the way the list is used.

    Returns:
        A list of lower-cased actor names.  If a page cannot be fetched, the
        names gathered so far are returned (an empty list when the very
        first request fails).
    """
    pageUrlTemplate = ('http://www.boxofficemojo.com/people/'
                       '?view=Actor&pagenum=%d&sort=sumgross&order=DESC&&p=.htm')
    actorsInTop100 = []

    for pageNumber in (1, 2):
        mojoTopActorsPage = film_industry_http.getPage(pageUrlTemplate % pageNumber)
        if mojoTopActorsPage is None:
            # No response from the url: return what we have so far.
            return actorsInTop100

        # Parse the response into a tree and pick the rows of the table
        # whose first header cell contains "Row".
        mojoTopActorsTree = lxml.html.parse(mojoTopActorsPage)
        actorRows = mojoTopActorsTree.xpath(
            '//table[contains(tr[1]/td[1]/font, "Row")]//tr')

        # The first row is the header, not an actual actor.  Slicing it off
        # (instead of calling next() on an iterator) also copes with the
        # table being absent: there is then simply nothing to add.
        for actorRow in actorRows[1:]:
            # Names are in the 2nd column of each row.  lower() is called on
            # the value itself so it works whichever string type the XPath
            # evaluation hands back; str.lower(value) rejects unicode
            # objects on Python 2.
            actorsInTop100.append(actorRow.xpath('string(.//td[2])').lower())

    return actorsInTop100
def getAllMovieData():
    """Collect movie data for every title in Box Office Mojo's alphabetical index.

    The year range is read from film_industry_settings.yearStart and
    film_industry_settings.yearEnd and forwarded to processMojoAlphabet.
    The function works in two phases:

    1. Visit each alphabet link on the index page and build the list of its
       sub-page URLs ("<alphabet href>&page=<n>").
    2. Run processMojoAlphabet on every sub-page URL and accumulate the
       results.

    Returns:
        A list with the entries produced by processMojoAlphabet.  The list
        is empty when the index page cannot be fetched.  The total count is
        printed before returning (except for that early exit).
    """
    yearStart = film_industry_settings.yearStart
    yearEnd = film_industry_settings.yearEnd
    allMovieDataList = []

    mojoIndexPage = film_industry_http.getPage('http://www.boxofficemojo.com/movies/')
    if mojoIndexPage is None:
        return allMovieDataList
    mojoIndexTree = lxml.html.parse(mojoIndexPage)

    # Get list of links to alphabets.  The row is recognised by its first
    # link text starting with "#"; if it is missing, treat the index as
    # having no alphabet links instead of failing on [0].
    alphabetLinksTable = mojoIndexTree.xpath(
        '//tr[starts-with(td[1]/font/b/a/text(), "#")]')
    if alphabetLinksTable:
        alphabetLinks = alphabetLinksTable[0].xpath('.//td//a')
    else:
        alphabetLinks = []

    alphabetSubLinksList = []
    for alpha in alphabetLinks:
        alpha.make_links_absolute()
        alphaHrefs = alpha.xpath('.//@href')
        if not alphaHrefs:
            # An anchor without an href cannot be followed.
            continue
        letterHref = alphaHrefs[0]

        mojoAlphabetPage = film_industry_http.getPage(letterHref)
        if mojoAlphabetPage is None:
            # Stop enumerating alphabet pages; sub-pages found so far are
            # still processed below.
            break
        mojoAlphabetTree = lxml.html.parse(mojoAlphabetPage)

        # Deal with subpages: one <b> element in the first navigation holder
        # is counted per sub-page.
        alphaSubPageCount = len(mojoAlphabetTree.xpath(
            "//div[@id = 'body']//div[@class = 'alpha-nav-holder'][1]/font//b"))
        for subPageNumber in range(1, alphaSubPageCount + 1):
            alphabetSubLinksList.append(letterHref + '&page=' + str(subPageNumber))

    for subPageHref in alphabetSubLinksList:
        # Treat a missing (None) result as "no movies" so that one failed
        # sub-page cannot abort the whole run with a TypeError.  extend()
        # also avoids re-copying the accumulated list on every iteration.
        allMovieDataList.extend(
            processMojoAlphabet(subPageHref, yearStart, yearEnd) or [])

    # Show the total number of movies fetched
    print('Total number of movies fetched: ' + str(len(allMovieDataList)))
    return allMovieDataList
def getTop100Directors():
    """Return the lower-cased names of the top-grossing directors on Box Office Mojo.

    Walks the first two pages of the "people" chart (view=Director, sorted
    by total gross, descending) and collects the text of the 2nd column of
    every data row.

    Returns:
        A list of lower-cased director names.  If a page cannot be fetched,
        the names gathered so far are returned (an empty list when the very
        first request fails).
    """
    pageUrlTemplate = ('http://www.boxofficemojo.com/people/'
                       '?view=Director&pagenum=%d&sort=sumgross&order=DESC&&p=.htm')
    directorsInTop100 = []

    for pageNumber in (1, 2):
        mojoTopDirectorsPage = film_industry_http.getPage(pageUrlTemplate % pageNumber)
        if mojoTopDirectorsPage is None:
            # No response from the url: return what we have so far.
            return directorsInTop100

        mojoTopDirectorsTree = lxml.html.parse(mojoTopDirectorsPage)
        directorRows = mojoTopDirectorsTree.xpath(
            '//table[contains(tr[1]/td[1]/font, "Row")]//tr')

        # Skip the header row.  Slicing (instead of next() on an iterator)
        # means a page without the expected table contributes nothing
        # rather than raising StopIteration.
        for directorRow in directorRows[1:]:
            # Names are in the 2nd column.  lower() is called on the value
            # itself because str.lower(value) rejects unicode objects on
            # Python 2.
            directorsInTop100.append(
                directorRow.xpath('string(.//td[2])').lower())

    return directorsInTop100
def processMojoAlphabet(alphabetLink, yearStart, yearEnd):
    """Process one alphabetical listing page and return data for its movies.

    Every row of the table headed "Title (click to view box office)" is
    inspected.  Rows whose release date (7th column, "%m/%d/%Y") is missing,
    unparsable, or outside [yearStart, yearEnd] are skipped; for the rest,
    the movie link in the 1st column is passed to processMojoMovie.

    Args:
        alphabetLink: URL of the listing page to fetch.
        yearStart: First release year to include (inclusive).
        yearEnd: Last release year to include (inclusive).

    Returns:
        A list with the non-None results of processMojoMovie.  An empty list
        is returned when the page cannot be fetched, so callers can always
        concatenate or iterate the result.
    """
    movieDataList = []

    mojoAlphabetPage = film_industry_http.getPage(alphabetLink)
    if mojoAlphabetPage is None:
        # Same failure convention as the other scrapers in this file:
        # hand back the (empty) list rather than None.
        return movieDataList
    mojoAlphabetTree = lxml.html.parse(mojoAlphabetPage)
    movieRows = mojoAlphabetTree.xpath(
        '//table[contains(tr[1]/td[1]/font, "Title (click to view box office)")]//tr')

    for movieRow in movieRows:
        if len(movieRow) < 7:
            continue

        # The page lists release dates in the 7th column of the table,
        # either as a link or as plain text inside <font>.
        dateNodes = (movieRow.xpath('./td[7]/font/a')
                     or movieRow.xpath('./td[7]/font'))
        if not dateNodes:
            continue
        releaseDateText = dateNodes[0].text
        if releaseDateText is None:
            # An element without text would make strptime raise TypeError.
            continue
        try:
            releaseDate = time.strptime(releaseDateText, "%m/%d/%Y")
        except ValueError:
            # Not a full month/day/year date: skip the row.
            continue

        # Ignore movies that fall outside our year range.
        if not yearStart <= releaseDate.tm_year <= yearEnd:
            continue

        titleLinks = movieRow.xpath('./td[1]/font/a')
        if not titleLinks:
            continue
        movieLinkTag = titleLinks[0]
        movieLinkTag.make_links_absolute()
        movieHrefs = movieLinkTag.xpath('./@href')
        if not movieHrefs:
            continue

        movieLinkInfo = processMojoMovie(movieHrefs[0])
        if movieLinkInfo is not None:
            movieDataList.append(movieLinkInfo)

    return movieDataList
def getFranchiseMovies(franchiseHref):
    """Return the lower-cased titles listed on one franchise chart page.

    Args:
        franchiseHref: URL of the franchise chart to fetch.

    Returns:
        A list with the lower-cased text of the 2nd column of each movie row
        in the table whose first header cell contains "Rank".  The list is
        empty when the page cannot be fetched.
    """
    mojoFranchisePage = film_industry_http.getPage(franchiseHref)
    if mojoFranchisePage is None:
        return []
    mojoFranchiseTree = lxml.html.parse(mojoFranchisePage)

    franchiseRows = mojoFranchiseTree.xpath(
        '//table[contains(tr[1]/td[1]/font/a/text(), "Rank")]//tr')

    # Drop the header row and exclude the last 2 rows of the table
    # (presumably summary rows -- TODO confirm against the live page).
    # lower() is called on the value itself because str.lower(value)
    # rejects unicode objects on Python 2.
    return [movieRow.xpath('string(.//td[2])').lower()
            for movieRow in franchiseRows[1:-2]]
def getFranchises():
    """Return the lower-cased titles of all movies that belong to a franchise.

    Reads the franchise index page, follows the first link found in every
    data row of the table headed "Franchise (click to view chart)", and
    gathers the titles reported by getFranchiseMovies for each chart.

    Returns:
        A flat list of movie titles across all franchises.  The list is
        empty when the index page cannot be fetched.
    """
    franchisesList = []

    mojoFranchisesPage = film_industry_http.getPage(
        'http://www.boxofficemojo.com/franchises/')
    if mojoFranchisesPage is None:
        return franchisesList
    mojoFranchisesTree = lxml.html.parse(mojoFranchisesPage)

    franchiseRows = mojoFranchisesTree.xpath(
        '//table[contains(tr[1]/td[1]/font/a/b/text(), "Franchise (click to view chart)")]//tr')

    # Skip the header row.  Slicing (instead of next() on an iterator) means
    # a page without the expected table yields an empty result rather than
    # raising StopIteration.
    for franchiseRow in franchiseRows[1:]:
        franchiseRow.make_links_absolute()
        franchiseHrefs = franchiseRow.xpath('.//@href')
        if not franchiseHrefs:
            # A row without any link has no chart to follow.
            continue
        # extend() grows the list in place instead of copying it each time.
        franchisesList.extend(getFranchiseMovies(franchiseHrefs[0]))

    return franchisesList
def processMojoMovie(movieLink): movieValue = {} mojoMoviePage = film_industry_http.getPage(movieLink) if mojoMoviePage == None: return None mojoMovieTree = lxml.html.parse(mojoMoviePage) # Get title titleString = mojoMovieTree.xpath('string(/html/head/title)') titleString = re.sub(r'\s*\([0-9]{1,4}\)\s*-\s*Box\s*Office\s*Mojo', '', titleString) titleOMDbSearchString = re.sub(r'\s*\([0-9]{1,4}\)', '', titleString) # this is just for OMDb search movieValue['title'] = titleString # Get distributor, release date, genre, runtime, rating, and budget from the upper table # Distributor distributorString = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Distributor: ")]/td[1])').replace('Distributor: ', '', 1) movieValue['distributor'] = distributorString # Release Date releaseDateString = mojoMovieTree.xpath('string(//tr[starts-with(td[2], "Release Date: ")]/td[2])').replace('Release Date: ', '', 1) yearOMDbSearchString = releaseDateString.split()[-1] movieValue['releaseDate'] = releaseDateString # Genre genreString = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Genre: ")]/td[1])').replace('Genre: ', '', 1) movieValue['genre'] = genreString # Budget budgetString = mojoMovieTree.xpath('string(//tr[starts-with(td[2], "Production Budget: ")]/td[2])').replace('Production Budget: ', '', 1) movieValue['budget'] = budgetString # Runtime runtimeString = mojoMovieTree.xpath('string(//tr[starts-with(td[2], "Runtime: ")]/td[2])').replace('Runtime: ', '', 1) movieValue['runtime'] = runtimeString # Rating ratingString = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "MPAA Rating: ")]/td[1]/b)') movieValue['rating'] = ratingString # Get US gross revenue, widest release, opening weekend data from the summary page below # US Gross Revenue USGrossString = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Domestic:")]/td[2])').replace(u'\xa0', '', 1).replace(u'$', '', 1) movieValue['USGross'] = USGrossString # Widest Releases widestReleasesString = 
mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Widest' + u'\xa0' +'Release:")]/td[2])').replace(u'\xa0','',1).replace(' theaters', '', 1) movieValue['widestReleases'] = widestReleasesString # Opening Weekend values # Revenue revenueOpeningString = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Wide' + u'\xa0' + 'Opening' + u'\xa0' + 'Weekend:")]/td[2])').replace(u'\xa0', '', 1).replace(u'$', '', 1) # If there's no separate 'Wide Opening Weekend' section, go for just 'Opening Weekend' if len(revenueOpeningString) == 0: # Opening Weekend Revenue revenueOpeningString = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Opening' + u'\xa0' + 'Weekend:")]/td[2])').replace(u'\xa0', '', 1).replace(u'$', '', 1) # Get opening weekend information that are written right below opening weekend revenues openingWeekendInfo = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Opening' + u'\xa0' + 'Weekend:")]/following-sibling::tr/td[1])').replace('(','',1).replace(')','',1) # Since there was no separate 'wide opening', this movie is not 'platform release' movieValue['platform'] = 'false' else: # Get opening weekend information that are written right below opening weekend revenues (wide) openingWeekendInfo = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Wide' + u'\xa0' + 'Opening' + u'\xa0' + 'Weekend:")]/following-sibling::tr/td[1])').replace('(','',1).replace(')','',1) # If there is a 'Wide Opening Weekend' section, that movie must have had a 'platform release', meaning # it had a limited release before its wide release. # Then check for 'Limited Opening Weekend' as well, and record the values recorded for that section as well. 
# Get Opening Revenue and Info for 'Limited Opening Weekend' revenueOpeningLimitedString = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Limited' + u'\xa0' + 'Opening' + u'\xa0' + 'Weekend:")]/td[2])').replace(u'\xa0', '', 1).replace(u'$', '', 1) openingWeekendLimitedInfo = mojoMovieTree.xpath('string(//tr[starts-with(td[1], "Limited' + u'\xa0' + 'Opening' + u'\xa0' + 'Weekend:")]/following-sibling::tr/td[1])').replace('(','',1).replace(')','',1) # Since there is separate 'wide opening', this movie IS 'platform release' movieValue['platform'] = 'true' # Record Opening Weekend Revenue (Wide) movieValue['openingRevenue'] = revenueOpeningString # Parse Opening Weekend Info String (Wide) # Opening Weekend Ranking (Wide) openingWeekendRank = re.search(r'#(\d*[,]*)*\d+\s*rank', openingWeekendInfo) if openingWeekendRank: movieValue['openingRank'] = openingWeekendRank.group().replace('#','',1).replace(' rank', '', 1) else: movieValue['openingRank'] = '' # Opening Weekend Theaters (Wide) openingWeekendTheater = re.search(r'(\d*[,]*)*\d+\s*theaters', openingWeekendInfo) if openingWeekendTheater: movieValue['openingTheater'] = openingWeekendTheater.group().replace(' theaters', '', 1) else: movieValue['openingTheater'] = '' # Opening Average (Wide) openingWeekendAverage = re.search(r'\$(\d*[,]*)*\d+\s*average', openingWeekendInfo) if openingWeekendAverage: movieValue['openingAverage'] = openingWeekendAverage.group().replace('$', '', 1).replace(' average', '', 1) else: movieValue['openingAverage'] = '' # Parse 'Limited' Opening Weekend Info (if they exist) if movieValue['platform'] == 'true': # Record Opening Weekend Revenue (Limited) movieValue['openingRevenueLimited'] = revenueOpeningLimitedString # Opening Weekend Ranking (Limited) openingWeekendRankLimited = re.search(r'#(\d*[,]*)*\d+\s*rank', openingWeekendLimitedInfo) if openingWeekendRankLimited: movieValue['openingRankLimited'] = openingWeekendRankLimited.group().replace('#','',1).replace(' rank', '', 1) else: 
movieValue['openingRankLimited'] = '' # Opening Weekend Theaters (Limited) openingWeekendTheaterLimited = re.search(r'(\d*[,]*)*\d+\s*theaters', openingWeekendLimitedInfo) if openingWeekendTheaterLimited: movieValue['openingTheaterLimited'] = openingWeekendTheaterLimited.group().replace(' theaters', '', 1) else: movieValue['openingTheaterLimited'] = '' # Opening Average (Limited) openingWeekendAverageLimited = re.search(r'\$(\d*[,]*)*\d+\s*average', openingWeekendLimitedInfo) if openingWeekendAverageLimited: movieValue['openingAverageLimited'] = openingWeekendAverageLimited.group().replace('$', '', 1).replace(' average', '', 1) else: movieValue['openingAverageLimited'] = '' else: # Since 'platform' is 'false', there is no 'limited opening weekend' section. movieValue['openingRevenueLimited'] = '' movieValue['openingRankLimited'] = '' movieValue['openingTheaterLimited'] = '' movieValue['openingAverageLimited'] = '' # Get Number of Weeks numberWeeksString = mojoMovieTree.xpath('//tr[starts-with(td[1]/font/a/b, "> View All")]/td[1]/font/a/b/text()') if len(numberWeeksString) == 0: movieValue['numberWeeks'] = '' else: for match in numberWeeksString: # On each movie page, there is a link that says '> View All (number) Weekends'. # We are trying to extract that number, which is a number of weeks for each movie. if match.find('Weekends') != -1: numberWeeksSearch = re.search(r'(\d*[,]*)*\d+\s*Weekends', match) # In some movies, the text is just '> View All Weekends' (no number between 'All' and 'Weekends'), # which means the movie was released for just 2 weeks (or less). if not numberWeeksSearch: movieValue['numberWeeks'] = 2 else: movieValue['numberWeeks'] = numberWeeksSearch.group().replace(' Weekends', '', 1) break # If the movie had a platform release, get the number of weeks under platform release as well. 
try: if movieValue['platform'] == 'true': numberWeeksLimitedString = mojoMovieTree.xpath('//tr[starts-with(td[1], "Release' + u'\xa0' + 'Dates:")]/td[2]/b/a/text()') limitedReleaseDate = datetime.datetime.strptime(numberWeeksLimitedString[0], "%B %d, %Y") wideReleaseDate = datetime.datetime.strptime(numberWeeksLimitedString[1], "%B %d, %Y") numberDaysLimitedWide = (wideReleaseDate - limitedReleaseDate).days movieValue['numberWeeksLimited'] = numberDaysLimitedWide / 7 else: movieValue['numberWeeksLimited'] = '' except NameError, IndexError: movieValue['numberWeeksLimited'] = ''