/
01-Movie-Scrape.py
214 lines (155 loc) · 5.42 KB
/
01-Movie-Scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""
Script: 01-Movie-Scrape.py
Purpose: Scrape data from boxofficemojo.com and match it to data pulled from the Open Movie Database (OMDB) API
Input: Pages from boxofficemojo.com
Movie data pulled from OMDB API
Output: data/movie_data_raw_YYYY.pickle
"""
import re
import requests
from bs4 import BeautifulSoup
import omdb
import pickle
def scrape_value(soup, regex):
    """Return the stripped text of the element that follows a labelled text node.

    Args:
        soup: Parsed page to search (a BeautifulSoup object in this script).
        regex: Pattern identifying the label text, e.g. "Genre:".

    Returns:
        The whitespace-stripped ``.text`` of the label's next sibling element.

    Raises:
        AttributeError: If no text node matches ``regex`` or the matching node
            has no following sibling element (``None`` is dereferenced).
    """
    label_pattern = re.compile(regex)
    label_node = soup.find(text=label_pattern)
    value_node = label_node.findNextSibling()
    return value_node.text.strip()
def clean_runtime(raw_value):
    """Convert a scraped runtime string to a total number of minutes.

    The first whitespace-separated token is read as hours and the third as
    minutes (e.g. "1 hrs. 45 min." -> 105).

    Args:
        raw_value: Runtime text, or "N/A" (any letter case) when unknown.

    Returns:
        Total minutes as an int, or float NaN when the runtime is "N/A".

    Raises:
        IndexError, ValueError: If the text does not have integer tokens at
            positions 0 and 2.
    """
    if raw_value.upper() == "N/A":
        # float("nan") is the same value numpy exposes as np.nan; using the
        # builtin avoids the NameError the missing numpy import caused.
        return float("nan")
    tokens = raw_value.split()
    return int(tokens[0]) * 60 + int(tokens[2])
def clean_budget(raw_value):
    """Convert a scraped budget string to a dollar amount.

    "$" and "," are removed; a trailing "million" token scales the number by
    1,000,000 (e.g. "$25 million" -> 25000000.0, "$500,000" -> 500000.0).
    No other unit word is handled.

    Args:
        raw_value: Budget text, or "N/A" (any letter case) when unknown.

    Returns:
        The budget as a float, or float NaN when the budget is "N/A".

    Raises:
        IndexError, ValueError: If the text has no leading numeric token.
    """
    if raw_value.upper() == "N/A":
        # float("nan") is the same value numpy exposes as np.nan; using the
        # builtin avoids the NameError the missing numpy import caused.
        return float("nan")
    tokens = raw_value.replace("$", "").replace(",", "").split()
    amount = float(tokens[0])
    if len(tokens) > 1 and tokens[1] == "million":
        amount = amount * 1000000
    return amount
# Multipliers per release year, calculated from the BLS inflation calculator:
# http://www.bls.gov/data/inflation_calculator.htm
_INFLATION_FACTORS = {
    2006: 0.969,
    2007: 0.942,
    2008: 0.907,
    2009: 0.910,
    2010: 0.896,
    2011: 0.868,
    2012: 0.851,
    2013: 0.838,
    2014: 0.825,
    2015: 0.824,
}


def inflation_adjust(year, raw_value):
    """Scale a dollar amount by the inflation factor for its year.

    Args:
        year: Year the amount belongs to.
        raw_value: Amount to adjust.

    Returns:
        ``raw_value`` multiplied by the factor for ``year`` (2006-2015), or
        ``raw_value`` unchanged for any other year.
    """
    factor = _INFLATION_FACTORS.get(year)
    if factor is None:
        return raw_value
    return raw_value * factor
def scrape_movie(url):
    """Scrape one Box Office Mojo movie page and add Oscar and OMDB data.

    Args:
        url: Movie page URL containing "/movies/"; the Oscar page URL is
            derived from it by replacing that segment with "/oscar/movies/".

    Returns:
        A dict with the keys title, release_date, release_year, genre, rating,
        runtime, budget, imdb_id, metascore, imdb_rating, tomato_meter,
        tomato_user_meter, plot, oscar_noms and oscar_wins. The OMDB-derived
        fields (imdb_id through plot) are None when no OMDB search result has
        the same release year.

    Raises:
        Any exception from the HTTP requests, the page lookups or the OMDB
        client propagates to the caller.
    """
    soup = BeautifulSoup(requests.session().get(url).text, "lxml")

    # Movie title: keep the part of the page <title> before the first "(".
    title = soup.find("title").text.split("(")[0].strip()

    # NOTE(review): `parse` is not imported anywhere in this file; presumably
    # it is dateutil.parser.parse -- confirm and import it at the top of the
    # file, otherwise this line raises NameError for every movie.
    release_date = parse(scrape_value(soup, "Release Date:"))
    release_year = release_date.year

    genre = scrape_value(soup, "Genre:")
    rating = scrape_value(soup, "Rating:")
    runtime = clean_runtime(scrape_value(soup, "Runtime:"))
    budget = clean_budget(scrape_value(soup, "Production Budget:"))
    budget = inflation_adjust(release_year, budget)

    # Count Oscar nominations and wins from the movie's Oscar page.
    oscar_url = url.replace("/movies/", "/oscar/movies/")
    oscar_soup = BeautifulSoup(requests.session().get(oscar_url).text, "lxml")
    oscar_noms = 0
    oscar_wins = 0
    for link in oscar_soup.find_all("a", href=True):
        if "oscar/chart/" not in link["href"]:
            continue
        # UTF-8 encode the link's first child node, as the original did.
        label = str(link.contents[0].encode("utf-8"))
        if "View All" in label.title():
            continue
        oscar_noms += 1
        # The original pattern "(WIN)" was a regex group, i.e. it matched any
        # "WIN" substring; a plain substring test keeps that behaviour without
        # comparing a match object to an int (a TypeError on Python 3).
        if "WIN" in label.upper():
            oscar_wins += 1

    # Pick the first OMDB search result released in the same year.
    imdb_id = None
    for result in omdb.search_movie(title):
        if int(result["year"]) == release_year:
            imdb_id = result["imdb_id"]
            break

    # Ratings and plot default to None when there is no OMDB match.
    omdb_fields = ["metascore", "imdb_rating", "tomato_meter",
                   "tomato_user_meter", "plot"]
    omdb_values = dict.fromkeys(omdb_fields)
    if imdb_id is not None:
        omdb_content = omdb.imdbid(imdb_id, tomatoes=True)
        for field in omdb_fields:
            omdb_values[field] = omdb_content[field]

    return {
        "title": title,
        "release_date": release_date,
        "release_year": release_year,
        "genre": genre,
        "rating": rating,
        "runtime": runtime,
        "budget": budget,
        "imdb_id": imdb_id,
        "metascore": omdb_values["metascore"],
        "imdb_rating": omdb_values["imdb_rating"],
        "tomato_meter": omdb_values["tomato_meter"],
        "tomato_user_meter": omdb_values["tomato_user_meter"],
        "plot": omdb_values["plot"],
        "oscar_noms": oscar_noms,
        "oscar_wins": oscar_wins,
    }
#print scrape_movie("http://www.boxofficemojo.com/movies/?id=mariesstory.htm")
#Scrape Movie and Page Links from Yearly URL Index
def scrape_links(url,lookup):
    """Collect the absolute URLs of links on a page whose href contains ``lookup``.

    Args:
        url: Page to download and parse.
        lookup: Substring to look for in each anchor's href. A match at the
            very start of the href (index 0) is not counted.

    Returns:
        A sorted list of unique URLs, each formed by prefixing the matching
        href with "http://www.boxofficemojo.com".
    """
    response = requests.session().get(url)
    soup = BeautifulSoup(response.text, "lxml")
    site_root = "http://www.boxofficemojo.com"
    unique_links = {
        site_root + anchor["href"]
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].find(lookup) > 0
    }
    return sorted(unique_links)
#Scrape From All Year in Analysis Period
def _scrape_movie_batch(movie_links, data):
    """Scrape each movie URL, appending the resulting rows to ``data``."""
    for movie_link in movie_links:
        try:
            row = scrape_movie(movie_link)
            data.append(row)
            print("Scraped: " + row["title"] + " " + str(row["release_year"]))
        except Exception as err:
            # Best effort: one bad page must not abort the whole year. Unlike
            # a bare ``except:``, this lets Ctrl-C (KeyboardInterrupt) and
            # SystemExit stop the run, and the reason is reported.
            print("SCRAPE FAILED: " + movie_link + " (" + repr(err) + ")")


def scrape_years(start_year,stop_year):
    """Scrape every movie listed for each year and pickle one file per year.

    For each year from ``start_year`` through ``stop_year`` inclusive, the
    movies linked from the yearly chart page and from each of its "page="
    links are scraped, and the resulting list of row dicts is written to
    data/movie_data_raw_<year>.pickle.

    Args:
        start_year: First year to scrape.
        stop_year: Last year to scrape (inclusive).

    Returns:
        None. Progress and failures are printed; results are written to disk.
    """
    import os  # local import: only needed to make sure the output folder exists

    for yr in range(start_year, stop_year + 1):
        data = []
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Starting Scape for: " + str(yr))
        year_link = "http://www.boxofficemojo.com/yearly/chart/?yr=" + str(yr) + "&p=.htm"

        # Movies on the first chart page, then on every further page it links.
        _scrape_movie_batch(scrape_links(year_link, "movies/?id="), data)
        for page_link in scrape_links(year_link, "page="):
            _scrape_movie_batch(scrape_links(page_link, "movies/?id="), data)

        print(str(yr) + " Pickle Output: " + str(len(data)))

        # Save data as pickle; create the folder first so a full year of
        # scraping is not lost to a missing directory.
        out_pickle = "data/movie_data_raw_" + str(yr) + ".pickle"
        out_dir = os.path.dirname(out_pickle)
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        with open(out_pickle, "wb") as f:
            pickle.dump(data, f)
# Scrape the analysis years, but only when this file is run as a script, so
# importing it does not start a multi-year scrape.
if __name__ == "__main__":
    scrape_years(2005, 2015)