/
scraper_intro.py
126 lines (107 loc) · 4.48 KB
/
scraper_intro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
import mechanize
from bs4 import BeautifulSoup, Tag, NavigableString
from data import Author, Book
def scrape(url):
    '''
    Scrape the HTML from a given url.

    Opens the url with a mechanize browser configured to ignore robots.txt,
    follow redirects, send a referer and present a desktop Firefox User-agent,
    using a 5 second timeout.

    Returns the raw response body, or None if anything goes wrong. Failures
    are printed rather than raised, so this function never throws.
    '''
    try:
        # Hand mechanize the interpreter's native string type and convert only
        # when necessary: Python 2 unicode is encoded, Python 3 bytes are
        # decoded. An unconditional .encode() fails on Python 2 byte strings
        # holding non-ASCII data and yields bytes on Python 3.
        target = url
        if not isinstance(target, str):
            if isinstance(target, bytes):
                target = target.decode('utf8')
            else:
                target = target.encode('utf8')

        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)

        # Cheat to win: pretend to be a regular desktop browser
        br.addheaders = [
            ('User-agent',
             'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
        ]

        br.open(target, timeout=5)
        html = br.response().read()
        return html
    except Exception as e:
        # Deliberately broad: scraping is best effort, and a None result is
        # the only failure signal this function gives.
        print("Failed to scrape %s %s" % (repr(url), e))
        return None
def goodreads_scrape(url):
    '''
    Input: Goodreads search url
    Output: (books, num_pages)
        books     -- list of (year, title, link) tuples, one per result row;
                     year is 0 when no 4-digit year could be found
        num_pages -- highest page number seen in the pager (at least 1)

    Returns ([], 1) when the page could not be fetched. Result rows that have
    no usable title are skipped.
    '''
    html = scrape(url)
    if not html:
        return [], 1

    # Name the parser explicitly so parsing does not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(html, "html.parser")

    # Look for the "previous page" link to find the number of result pages
    num_pages = 1
    previous_page = soup.find("span", attrs={"class": "previous_page"})
    if previous_page is not None and previous_page.parent is not None:
        for page_link in previous_page.parent.find_all("a"):
            link_number = page_link.text.strip()
            try:
                num_pages = max(num_pages, int(link_number))
            except ValueError:
                # Pager links whose text is not a number are ignored
                continue

    # Look for all the table rows listed as books
    books = []
    for tr in soup.find_all("tr", attrs={"itemtype": "http://schema.org/Book"}):
        # Find the book's title from the "bookTitle" link
        title_box = tr.find("a", attrs={"class": "bookTitle"})
        title_span = title_box.find('span') if title_box is not None else None
        if title_span is None:
            # Malformed row: skip it instead of aborting the whole page
            continue
        title = title_span.text

        if title_box.has_attr('href'):
            link = "http://goodreads.com" + title_box['href']
        else:
            link = "/"

        # Look for the date in the "published by" text but fall back to 0.
        # It's usually old/weird results that don't have this date field.
        date = 0
        small_text = tr.find("span", attrs={'class': 'greyText smallText uitext'})
        if small_text is not None:
            year_match = re.search(r'\s\d\d\d\d\s', small_text.text)
            if year_match:
                date = int(year_match.group().strip())

        books.append((date, title, link))

    return books, num_pages
def scrape_author(author, earliest_year=0):
    '''
    This function searches goodreads.com for the given author name and compiles all results.
    If earliest_year is specified, only return books with a publication year >= earliest_year.
    Note: this isn't just an author search, so it may retrieve some literary criticism, etc.

    Returns a list of (year, title, link) tuples sorted by descending year.
    Progress is written to stdout on a single line, followed by a summary.
    '''
    import sys
    try:
        from urllib.parse import quote_plus  # Python 3
    except ImportError:
        from urllib import quote_plus  # Python 2

    # None sorts below every int on Python 2 but raises on Python 3; treating
    # it as "no lower bound" gives the same filter result on both.
    if earliest_year is None:
        earliest_year = 0

    # Build the query once. Whitespace is collapsed and the name lower-cased,
    # then percent-encoded so characters such as '&', '#' or non-ASCII letters
    # cannot corrupt the url (spaces still become '+').
    query = " ".join(author.split()).lower()
    if not isinstance(query, str):
        query = query.encode('utf8')  # Python 2 unicode -> byte string
    query = quote_plus(query)

    sys.stdout.write("Scraping %s " % author)

    # Do our initial search without a page number specified, and then snag the
    # "num_pages" data from that first search
    url = "http://www.goodreads.com/search?tab=books&q=" + query
    books, num_pages = goodreads_scrape(url)

    # Now that we have a value for num_pages, keep scraping each page until we get them all
    page_index = 1
    while page_index < num_pages:
        page_index += 1
        sys.stdout.write("Scraping %d/%d " % (page_index, num_pages))

        # Our new url should specify which page it's targeting
        new_url = "http://www.goodreads.com/search?page=%d&tab=books&q=%s" % (page_index, query)
        new_books, new_num_pages = goodreads_scrape(new_url)

        # Add the recent scrape to our list of books and update the total
        # num_pages if a later page reports more pages than we knew about
        books += new_books
        num_pages = max(num_pages, new_num_pages)

    # Filter out books before our "earliest year" field and sort them by descending date
    books = [book for book in books if book[0] >= earliest_year]
    print("==> We found %s books by %s since %s" % (len(books), author, earliest_year))
    books = sorted(books, key=lambda x: x[0], reverse=True)
    return books
def main():
    '''
    For every stored Author, scrape their books and save each one that is
    not already stored (matched on author name, title and year) as unread.
    '''
    for author in Author.objects:
        for year, raw_title, link in scrape_author(author.name, author.year):
            # Titles are stored with non-ASCII characters dropped
            title = raw_title.encode('ascii', 'ignore')

            already_stored = Book.objects(author=author.name, title=title, year=year).count()
            if already_stored:
                continue

            record = Book(
                author=author.name,
                title=title,
                link=link,
                year=year,
                read=False
            )
            record.save()


if __name__ == "__main__":
    main()