forked from tigerskitchen/howmanyin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
howmanyin.py
51 lines (40 loc) · 1.77 KB
/
howmanyin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/python3
#This scraper is specific to IMDB's URLs
#Typical usage:
#python howmanyin.py url namevariation1 namevariation2 etc...
from bs4 import BeautifulSoup
import requests
import re
def search_movie_for_names(url, names):
    """
    Search a movie on IMDB to see how many people in the credits match a given name.

    :param url: The URL for the movie you want to search. The IMDB title id
        (``tt`` followed by digits) is extracted from it; if no ``tt`` id is
        present, the first seven digits found in the string are used instead.
    :param names: A list of the names you want to search for in the movie's
        credits. Matching is by prefix, after normalising each name to
        "Capitalized" form. The list itself is not modified.
    :return: A tuple ``(movie_title, filtered_names)``. ``movie_title`` is the
        text of the ``subnav_heading`` link (``None`` if that link is missing
        or has no single string), and ``filtered_names`` is the list of unique
        link texts starting with one of the names, in page order.
    :raises ValueError: If no movie id can be extracted from ``url``.
    :raises requests.HTTPError: If the credits page request fails.
    """
    # Get the unique numeric code for the IMDB movie. Prefer the explicit
    # 'tt<digits>' token so ids longer than 7 digits and unrelated digits
    # elsewhere in the URL (e.g. query strings) are handled correctly.
    id_match = re.search(r'tt(\d+)', url)
    if id_match:
        movie_id = id_match.group(1)
    else:
        # Fallback for inputs without a 'tt' prefix: first seven digits
        movie_id = re.sub(r'\D', '', url)[:7]
    if not movie_id:
        raise ValueError('Could not find an IMDB movie id in: ' + repr(url))

    # Make sure we point to the credits page
    credits_url = 'http://www.imdb.com/title/tt' + movie_id + '/fullcredits'

    # Make sure the names are correctly capitalized. Build a new tuple rather
    # than rewriting the caller's list; str.startswith accepts a tuple.
    prefixes = tuple(name.lower().capitalize() for name in names)

    # A timeout prevents the script from hanging forever on a stalled connection
    r = requests.get(credits_url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    links = soup.find_all('a')
    heading = soup.find('a', class_='subnav_heading')
    movie_title = heading.string if heading is not None else None

    # TODO: filter this specifically to the div with id 'fullcredits_content'
    all_names = [i.string.strip() for i in links if i.string is not None]

    # Keep the first occurrence of each matching name, preserving page order
    filtered_names = []
    seen = set()
    for name in all_names:
        if name.startswith(prefixes) and name not in seen:
            seen.add(name)
            filtered_names.append(name)
    return movie_title, filtered_names
# Run the code
# The usage documented at the top of this file is
#   python howmanyin.py url namevariation1 namevariation2 etc...
# so honour those arguments when run as a script; with no arguments (or when
# imported) fall back to the built-in example below.
import sys

movie_url = 'http://www.imdb.com/title/tt0441773/fullcredits'
names_to_search = ['Matt', 'Matthew', 'Matti', 'Matty', 'Mat', 'Mathew']
if __name__ == '__main__' and len(sys.argv) > 1:
    movie_url = sys.argv[1]
    # Keep the default name variations when only a URL is supplied
    names_to_search = sys.argv[2:] or names_to_search
movie_title, filtered_names = search_movie_for_names(movie_url, names_to_search)
print(str(movie_title) + ' has ' + str(len(filtered_names)) + '!!!')
print(filtered_names)