03-multithreading.py
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from urllib.parse import urljoin
from thready import threaded
import dataset
import os
from hashlib import sha1

# The New York Craigslist base url; missed connections are listed under "mis/"
BASE_URL = 'http://newyork.craigslist.org/'

# connect to our database
db = dataset.connect('sqlite:///missed_connections.db')

# a directory for caching files we've already downloaded
CACHE_DIR = os.path.join(os.path.dirname(__file__), 'cache')

def url_to_filename(url):
    """ Make a URL into a file name, using SHA1 hashes. """
    # use a sha1 hash to convert the url into a unique filename
    hash_file = sha1(url.encode('utf-8')).hexdigest() + '.html'
    return os.path.join(CACHE_DIR, hash_file)
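
# For example, url_to_filename('http://newyork.craigslist.org/mis/') returns
# a path like os.path.join(CACHE_DIR, '<40-character sha1 hex digest>.html'),
# so the same url always maps to the same cache file.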

def store_local(url, content):
    """ Save a local copy of the file. """
    # If the cache directory does not exist, make one.
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)

    # Save to disk.
    local_path = url_to_filename(url)
    with open(local_path, 'wb') as f:
        f.write(content)

def load_local(url):
    """ Read a local copy of a URL. """
    local_path = url_to_filename(url)
    if not os.path.exists(local_path):
        return None
    with open(local_path, 'rb') as f:
        return f.read()

def get_content(url):
    """ Wrap requests.get() with a local file cache. """
    content = load_local(url)
    if content is None:
        response = requests.get(url)
        content = response.content
        store_local(url, content)
    return content
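
# A quick illustration of how the cache behaves (a sketch, not executed
# anywhere in this script; it assumes CACHE_DIR is writable):
#
#   page = get_content(BASE_URL + 'mis/')  # first call: downloads the page and writes cache/<sha1>.html
#   page = get_content(BASE_URL + 'mis/')  # second call: read back from the local cache, no request made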

def scrape_missed_connections():
    """ Scrape all the missed connections from a list. """
    # Download the list of missed connections
    # here we're using requests,
    # a python library for accessing the web
    # we add "mis/" to the url to tell requests
    # to get the missed connections
    # on newyork.craigslist.org
    response = requests.get(BASE_URL + "mis/")

    # parse the HTML using Beautiful Soup
    # this returns a `soup` object which
    # gives us convenience methods for parsing html
    soup = BeautifulSoup(response.content, 'html.parser')

    # find all the posts in the page.
    # here we're telling BeautifulSoup to get us every
    # span tag that has a class that equals pl
    # these tags might look something like this:
    # <span class='pl'> {content} </span>
    missed_connections = soup.find_all('span', {'class': 'pl'})

    # create an empty list of urls to scrape
    urls = []
    for missed_connection in missed_connections:
        # for each span tag, find the "a" tag which
        # represents the link to the missed connection page.
        link = missed_connection.find('a').attrs['href']

        # join this relative link with the
        # BASE_URL to create an absolute link
        url = urljoin(BASE_URL, link)

        # iteratively populate this list
        urls.append(url)

    # download and parse these missed connections using
    # multiple threads
    threaded(urls, scrape_missed_connection, num_threads=10)
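
# If `thready` is not installed, a roughly equivalent approach would use the
# standard library's concurrent.futures. This is only a sketch and is not
# called anywhere in this script; the name `scrape_with_thread_pool` is not
# part of the original tutorial.
from concurrent.futures import ThreadPoolExecutor

def scrape_with_thread_pool(urls, num_threads=10):
    """ Scrape a list of missed-connection urls with a thread pool. """
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # map() schedules scrape_missed_connection for every url; consuming
        # the iterator with list() waits until all of them have finished.
        list(pool.map(scrape_missed_connection, urls))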

def scrape_missed_connection(url):
    """ Extract information from a missed connection's page. """
    # log the url we're scraping
    print("scraping %s ..." % url)

    # retrieve the missed connection's raw html (possibly from the cache)
    content = get_content(url)

    # Parse the html of the missed connection post
    soup = BeautifulSoup(content, 'html.parser')

    # Extract the actual contents of some HTML elements:
    # here we're using BeautifulSoup's `text` attribute for retrieving
    # the plain text within each HTML element.
    data = {
        'source_url': url,
        'subject': soup.find('h2', {'class': 'postingtitle'}).text.strip(),
        'body': soup.find('section', {'id': 'postingbody'}).text.strip(),
        'datetime': soup.find('time').attrs['datetime']
    }

    # Upsert the data into our database
    db['posts'].upsert(data, ['source_url'])

if __name__ == '__main__':
    scrape_missed_connections()
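
# After a run, the scraped posts can be inspected with dataset as well.
# A sketch, assuming the default missed_connections.db path used above:
#
#   db = dataset.connect('sqlite:///missed_connections.db')
#   for post in db['posts']:
#       print(post['subject'], post['datetime'])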