# read_book.py
import json
import re
from collections import Counter
from time import sleep

import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
def is_video(td):
    """it's a video if it has exactly one pricelabel, and if
    the stripped text inside that pricelabel starts with 'Video'"""
    pricelabels = td('span', 'pricelabel')
    return (len(pricelabels) == 1 and
            pricelabels[0].text.strip().startswith("Video"))
# example usage on a single parsed page, where tds = soup('td', 'thumbtext')
# (soup is built in the loop below):
# print(len([td for td in tds if not is_video(td)]))
def book_info(td):
    """given a BeautifulSoup <td> Tag representing a book,
    extract the book's details and return a dict"""
    title = td.find("div", "thumbheader").a.text
    by_author = td.find("div", "AuthorName").text
    authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")]
    isbn_link = td.find("div", "thumbheader").a.get("href")
    isbn = re.match(r"/product/(.*)\.do", isbn_link).group(1)
    date = td.find("span", "directorydate").text.strip()
    return {
        "title": title,
        "authors": authors,
        "isbn": isbn,
        "date": date,
    }
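
# a sketch of the dict shape book_info returns (these particular values
# are illustrative, not scraped):
# {"title": "Data Science from Scratch",
#  "authors": ["Joel Grus"],
#  "isbn": "0636920033400",
#  "date": "April 2015"}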
base_url = ("http://shop.oreilly.com/category/browse-subjects/"
            "data.do?sortby=publicationDate&page=")
# note: O'Reilly has since retired its shop pages, so this URL may no
# longer resolve

books = []

NUM_PAGES = 20  # at the time of writing; probably more by now

for page_num in range(1, NUM_PAGES + 1):
    print("souping page", page_num, ",", len(books), "found so far")
    url = base_url + str(page_num)
    print(url)
    soup = BeautifulSoup(requests.get(url).text, 'html5lib')

    for td in soup('td', 'thumbtext'):
        if not is_video(td):
            books.append(book_info(td))

    # now be a good citizen and respect the robots.txt!
    sleep(2)

print(books)
print(len(books))
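
# a minimal sketch (not part of the original script): cache the scraped
# results so re-running the analysis below doesn't re-hit the site;
# the filename "books.json" is an arbitrary choice
with open("books.json", "w") as f:
    json.dump(books, f)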
def get_year(book):
    """book["date"] looks like 'November 2014' so we need to
    split on the space and then take the second piece"""
    return int(book["date"].split()[1])

# restrict the counts to the last complete year of data
# (2017 when this was run)
year_counts = Counter(get_year(book) for book in books
                      if get_year(book) <= 2017)

years = sorted(year_counts)
book_counts = [year_counts[year] for year in years]
plt.plot(years, book_counts)
plt.ylabel("# of data books")
plt.title("Data is Big!")
plt.show()
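
# a headless alternative (a sketch, not in the original): save the chart
# to a file instead of opening a window by calling, before plt.show(),
#     plt.savefig("books_per_year.png")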
serialized = """{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : 2014,
                  "topics" : [ "data", "science", "data science"] }"""

# parse the JSON to create a Python dict
deserialized = json.loads(serialized)
if "data science" in deserialized["topics"]:
    print(deserialized)
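
# the reverse direction (a sketch): json.dumps serializes the dict back
# into a JSON string; sort_keys here is just for stable, readable output
reserialized = json.dumps(deserialized, sort_keys=True)
print(reserialized)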
endpoint = "https://api.github.com/users/joelgrus/repos"
repos = json.loads(requests.get(endpoint).text)

for repo in repos:
    print(repo["name"])
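
# a sketch of one more thing to do with the payload: count repos by the
# year they were created. "created_at" is a standard field in GitHub's
# repos response; the strptime format assumes its usual ISO-8601 shape.
from datetime import datetime

created = [datetime.strptime(repo["created_at"], "%Y-%m-%dT%H:%M:%SZ")
           for repo in repos]
print(Counter(date.year for date in created))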