-
Notifications
You must be signed in to change notification settings - Fork 0
/
awards_scraper.py
78 lines (64 loc) · 2.5 KB
/
awards_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from scraper import Scraper
from bs4 import BeautifulSoup
import csv, re
# Base search URL for the Newsday Oscar-winners database.  The trailing
# "currentRecord=" query parameter receives the 1-based record offset when
# paginating (see AwardScraper.scrape_all / get_next_page).
# FIX: the original text contained the mojibake "¤tRecord=" — an HTML-entity
# decoder swallowed "&curren" out of "&currentRecord"; restored here.
awards_base_url = "http://data.newsday.com/long-island/data/entertainment/movies/oscar-winners-history/?q=&searchField=actorDirectorName&fieldSelect-result=&fieldSelect-category=&fieldSelect-year=&currentRecord="
class AwardScraper(Scraper):
    """Scrape the Newsday Oscar-winners database into academy_awards.csv.

    Results are paginated through a ``currentRecord`` <select> element on
    each page; ``scrape_all`` walks every page and writes one tab-separated
    row per award entry: Year, Category, Won, FilmName, ActorDirectorName.
    """

    def __init__(self, base_url=awards_base_url, search_url=""):
        Scraper.__init__(self, base_url, search_url)
        # NOTE(review): 'wb' mode for a csv writer implies Python 2; on
        # Python 3 this would need mode 'w' with newline=''.
        self.file = open('academy_awards.csv', 'wb')
        self.writer = csv.writer(self.file, delimiter='\t')
        self.writer.writerow(['Year', 'Category', 'Won', 'FilmName', 'ActorDirectorName'])
        self.soup = self.connect(base_url)  # soup of the currently-loaded page
        self.next_record = '1'              # record offset of the next page to load

    def scrape_page(self):
        """Write one CSV row per result row of the currently-loaded page."""
        table = self.soup.find("table", {"id": "sdb-results"})
        trs = table.find_all("tr")
        # Skip trs[0]: the header row of the results table.
        for tr in trs[1:]:
            row = [self.get_year(tr),
                   self.get_category(tr),
                   self.get_won(tr),
                   self.get_film_name(tr),
                   self.get_actor_director_name(tr)]
            self.writer.writerow(row)

    def get_year(self, row):
        """Return the award year of a result row as an int."""
        year = row.find('td', {'class': re.compile('year$')}).text
        return int(year)

    def get_category(self, row):
        """Return the award category text of a result row."""
        return row.find('td', {'class': re.compile('category$')}).text

    def get_won(self, row):
        """Return whether the nominee won, via Scraper.str2bool on the result cell."""
        won = row.find('td', {'class': re.compile('result$')}).text
        return self.str2bool(won)

    def get_film_name(self, row):
        """Return the film title of a result row, with double quotes stripped."""
        film_name = row.find('a', {'href': re.compile('(filmName)')}).text
        return film_name.replace('\"', '')

    def get_actor_director_name(self, row):
        """Return the actor/director name of a result row."""
        return row.find('a', {'href': re.compile('(actorDirectorName)')}).text

    def get_next_page(self):
        """Return the record offset of the next page, or None on the last page."""
        if self.next_record == '1001':
            # NOTE(review): hard-coded skip from offset 1001 to 1051 — presumably
            # works around a broken page on the site; confirm against live data.
            return '1051'
        else:
            select = self.soup.find('select', {'name': 'currentRecord'})
            option = select.find('option', {'selected': 'SELECTED'})
            next_option = option.findNextSibling()
            if next_option:
                return next_option['value']
            else:
                return None

    def get_current_page(self):
        """Return the record offset of the currently-loaded page."""
        select = self.soup.find('select', {'name': 'currentRecord'})
        option = select.find('option', {'selected': 'SELECTED'})
        return option['value']

    def scrape_all(self):
        """Scrape every results page, following pagination until exhausted.

        Fixes vs. the original: get_next_page() was called twice per
        iteration (condition + value), and the last page was connected to a
        second, redundant time after the loop; the CSV file handle was never
        closed, risking loss of buffered rows.
        """
        next_record = self.get_next_page()
        while next_record:
            self.next_record = next_record
            self.scrape_page()
            self.soup = self.connect(awards_base_url + self.next_record)
            next_record = self.get_next_page()
        # The final page has no "next" option but still holds result rows.
        self.scrape_page()
        self.file.close()
def main():
    """Entry point: scrape the full awards database to academy_awards.csv."""
    scraper = AwardScraper()
    scraper.scrape_all()


if __name__ == '__main__':
    main()