/
imdb.py
200 lines (160 loc) · 6.89 KB
/
imdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import sys
import re
from bs4 import BeautifulSoup
import getopt
import urllib.request
# Root of the IMDb site; relative links scraped from pages are appended to it.
BASE_URL = "http://www.imdb.com"
# Address of the chart page that getMovieList() downloads.
topUrl = BASE_URL+"/chart/top"
def main(argv):
    """
    Entry point: parse the options, scrape IMDb and print the role counts.

    :param argv: Command-line arguments without the program name.
                 Recognised options: -g/--get N, -y/--year YEAR,
                 -b/--beauty and -h/--help.
    :return: None. The result is printed to standard output.
    :raises SystemExit: with status 0 after printing the help text, and
                        with status 2 when an option is unknown or a
                        numeric option value is not an integer.
    """
    uYear = None
    nrActors = None
    beauty = False
    try:
        # g is how many actors to print and y is the year filter passed
        # on to getMovieList().
        opts, _args = getopt.getopt(argv, "g:y:bh", ["get=", "year=", "beauty", "help"])
    except getopt.GetoptError as err:
        # Refuse to start the (slow) scrape with options we did not
        # understand instead of silently ignoring all of them.
        print("Error: %s. Try -h or --help for usage." % err, file=sys.stderr)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            printHelp()
            sys.exit(0)
        elif opt in ("-b", "--beauty"):
            beauty = True
        elif opt in ("-g", "--get", "-y", "--year"):
            try:
                value = int(arg)
            except ValueError:
                print("Error: option %s expects an integer, got %r."
                      % (opt, arg), file=sys.stderr)
                sys.exit(2)
            if opt in ("-g", "--get"):
                nrActors = value
            else:
                uYear = value

    print("This program might take a while...Please wait")
    # Grab all movies from the chart page
    MovieList = getMovieList(uYear)
    print("Grabed the movie list")
    print("Starting collecting actors from each movie")
    # Grab all the actors in each film; the result has the form
    # {movieID: {actorID: name, actorID: name, ...}, movieID: {...}, ...}
    allActors = getMoviesActors(MovieList)
    print("Grabed the actors list from each movie")
    print("Starting sorting the data")
    finalSorted = _countRoles(allActors)

    # A missing (or zero) -g value means "print the whole list".
    shown = finalSorted[:nrActors] if nrActors else finalSorted
    for name, count in shown:
        if beauty:
            # The user asked for the more readable sentence format
            print("%s has played in %d movies" % (name, count))
        else:
            print(count, name)


def _countRoles(allActors):
    """
    Count in how many of the collected movies each actor appears.

    :param allActors: Mapping {movieID: {actorID: name, ...}, ...}.
    :return: A list of (name, count) tuples sorted by descending count and
             then by name. Empty when no actors were collected.
    """
    actorSet = {}
    for cast in allActors.values():
        for actorID, name in cast.items():
            if actorID in actorSet:
                # Seen before: one more role for this actor
                actorSet[actorID]["count"] += 1
            else:
                # First appearance: remember the name and start at 1
                actorSet[actorID] = {"name": name, "count": 1}
    finalList = [(entry["name"], entry["count"]) for entry in actorSet.values()]
    return sorted(finalList, key=lambda x: (-x[1], x[0]))
def printHelp():
    """Write the program's usage text to standard output."""
    # The literal is passed straight to print(); it starts with a newline
    # so the message is preceded by one blank line.
    print('''
This program prints out the numbersof roles each actor has played in the top 250 movies
on IMDB (The Internet Movie Database).
You can choose between two formats, both containing the actors name and how many movies
he as playd.
The formats are:
--- ActorsName has played in # movies
--- # ActorsName
Please make notice, this program takes a good time to run through large movie set
so please be patience.
Options:
-h, --help
Help message
-g, --get
How many actors to print out
-y, --year
From what year shall the program print out actors, the year of the films
-b, --beauty
Prints out: ActorsName has played in # movies
instead of: # ActorsName
''')
def getMoviesActors(movieList):
    """
    Download the full-credits page of every movie and collect its actors.

    :param movieList: Mapping whose values are dicts holding at least the
                      keys "ID" and "Url" (as built by getMovieList).
    :return: A dict {movieID: {actorID: actorName, ...}, ...}.
    """
    # Compiled once, outside the loops; raw strings avoid the invalid
    # escape sequences of the previous "\/" and "\d" literals.
    actorLink = re.compile(r"/name/nm")
    actorIdPattern = re.compile(r"/name/nm(?P<userid>\d+)")

    actorsInMovies = {}
    for movie in movieList.values():
        req = urllib.request.Request(BASE_URL + movie["Url"] + "/fullcredits")
        # Header is necessary to get the page in English
        req.add_header('Accept-Language', 'en-US,en')
        # Send the request; the context manager closes the connection as
        # soon as the page has been parsed.
        with urllib.request.urlopen(req) as response:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            bsoup = BeautifulSoup(response, "html.parser")
        findCastList = bsoup.find("table", {"class": "cast_list"})
        findAllActors = findCastList.findAll("td", itemprop="actor")
        actors = {}
        for cell in findAllActors:
            actorName = cell.find("span", itemprop="name")
            actorNumber = cell.find("a", href=actorLink)
            # The numeric part of "/name/nm<digits>" identifies the actor
            actorID = actorIdPattern.match(actorNumber["href"]).group("userid")
            actors[actorID] = actorName.contents[0]
        actorsInMovies[movie["ID"]] = actors
    return actorsInMovies
def getMovieList(uYear):
    """
    Get the movies listed on the chart page together with the information
    used elsewhere: the TT number, the title and the relative link.

    :param uYear: Optional year filter. When it is truthy, movies whose
                  year is lower than uYear are left out; None (or 0)
                  keeps every movie.
    :return: A dict {position: {"ID": ttNumber, "Title": title,
             "Url": relativeUrl}} where position is the 1-based place of
             the movie in the chart, so filtered-out movies leave gaps.
    """
    # Open a request to the IMDb webpage
    req = urllib.request.Request(topUrl)
    # Header is necessary to get the right movie titles, as in the english title
    req.add_header('Accept-Language', 'en-US,en')
    # Send the request; the context manager closes the connection as soon
    # as the page has been parsed.
    with urllib.request.urlopen(req) as response:
        # Name the parser explicitly so the result does not depend on
        # which optional parsers happen to be installed.
        bsoup = BeautifulSoup(response, "html.parser")
    # Movie list is in the tbody tag and in class lister-list
    findMovieList = bsoup.find("tbody", {"class": "lister-list"})
    # Each movie sits in a cell of class titleColumn holding a link tag
    # ("a") with the TT number and the title of the movie
    getMoviesList = findMovieList.findAll("td", {"class": "titleColumn"})

    # Compiled once, outside the loop; raw strings avoid the invalid
    # escape sequences of the previous "\/", "\d" and "\(" literals.
    anyText = re.compile(".+")
    urlPattern = re.compile(r".+/tt\d+")
    idPattern = re.compile(r"/title/tt(?P<movieid>\d+)")
    yearPattern = re.compile(r"\((?P<movieYear>\d+)\)")

    # Empty return dict
    MovieList = {}
    # For best result we start with number one
    for i, movieid in enumerate(getMoviesList, start=1):
        link = movieid.find("a", title=anyText)
        href = link["href"]
        movieUrl = urlPattern.match(href).group(0)
        movieNumber = idPattern.match(href).group("movieid")
        extractedYear = movieid.find("span", {"class": "secondaryInfo", "name": "rd"}).contents[0]
        movieYear = int(yearPattern.match(extractedYear).group("movieYear"))
        movieTitle = link.contents[0]
        if uYear and movieYear < uYear:
            # Older than the requested year: leave it out
            continue
        # Always store the extracted link text as the title; the old
        # fall-through branch stored the first child of the table cell.
        MovieList[i] = {"ID": movieNumber, "Title": movieTitle, "Url": movieUrl}
    return MovieList
# Run the program only when the file is executed as a script, passing on
# the command-line arguments without the program name.
if __name__ == '__main__':
    main(sys.argv[1:])