-
Notifications
You must be signed in to change notification settings - Fork 0
/
mongo.py
98 lines (75 loc) · 2.54 KB
/
mongo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pymongo
import requests
import os
import dataset
from bs4 import BeautifulSoup
from thready import threaded
from urlparse import urljoin
from pprint import pprint
from hashlib import sha1
# Site root; the listing path and relative ad links are resolved against it.
BASE_URL = "http://raleigh.craigslist.org/"

# Directory named 'cache', located next to this module, holding saved pages.
CACHE_DIR = os.path.join(
    os.path.dirname(__file__),
    'cache',
)
"""Make a URL into a file name, using SHA1 hashes."""
def url_to_filename(url):
    """Map a URL to the path of its cache file.

    The file name is the SHA-1 hex digest of the URL followed by ``.html``,
    located inside ``CACHE_DIR``.

    :param url: URL to convert. Text is encoded as UTF-8 before hashing;
        byte strings are hashed as given.
    :returns: Path of the cache file that corresponds to ``url``.
    """
    # hashlib digests operate on bytes; hashing un-encoded text raises
    # TypeError on Python 3 and fails for non-ASCII text on Python 2.
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    hash_file = sha1(url).hexdigest() + '.html'
    return os.path.join(CACHE_DIR, hash_file)
"""Save a local copy of the file"""
def store_local(url, content):
    """Write ``content`` to the cache file that belongs to ``url``.

    The cache directory is created first when it does not exist yet.

    :param url: URL whose cache file is written.
    :param content: Data to write; the file is opened in binary mode.
    """
    cache_missing = not os.path.isdir(CACHE_DIR)
    if cache_missing:
        os.makedirs(CACHE_DIR)
    with open(url_to_filename(url), 'wb') as cache_file:
        cache_file.write(content)
"""Read a local copy of a url"""
def load_local(url):
    """Return the cached copy of ``url``, or ``None`` if there is none.

    :param url: URL whose cache file is looked up.
    :returns: The cache file's contents read in binary mode, or ``None``
        when no cache file exists for ``url``.
    """
    cached_path = url_to_filename(url)
    if os.path.exists(cached_path):
        with open(cached_path, 'rb') as cache_file:
            return cache_file.read()
    return None
"""Wrap requests.get()"""
def get_content(url):
    """Return the body of ``url``, preferring the on-disk cache.

    Wraps ``requests.get()``: when ``load_local`` has a cached copy it is
    returned directly; otherwise the page is downloaded, saved with
    ``store_local`` and then returned.

    :param url: URL to fetch.
    :returns: The page body, either read from the cache or taken from
        ``response.content``.
    """
    content = load_local(url)
    if content is None:
        # Cache miss: download once and keep a copy for later calls.
        response = requests.get(url)
        content = response.content
        store_local(url, content)
    return content
"""Gets URLs for all current apartment CL adds"""
def scrape_cl_ads():
    """Scrape every apartment ad linked from the listing page.

    Downloads the ``apa/`` listing under ``BASE_URL``, collects the
    absolute URL of each ad found in a ``<span class="pl">`` element, passes
    the URLs to ``threaded`` so that ``scrape_cl_ad`` runs on each of them
    (``num_threads=2``), and prints how many ad URLs were handed over.

    The printed number counts ads submitted for scraping. Whether an ad is
    actually inserted is decided later by the duplicate check in the export
    step, so this function cannot report an "inserted" count.
    """
    response = requests.get(BASE_URL + "apa/")
    soup = BeautifulSoup(response.content)
    urls = []
    for ad in soup.find_all('span', {'class': 'pl'}):
        anchor = ad.find('a')
        # find() returns None when the span has no <a>; skip such rows
        # (and anchors without an href) instead of raising.
        if anchor is None or 'href' not in anchor.attrs:
            continue
        urls.append(urljoin(BASE_URL, anchor.attrs['href']))
    # NOTE(review): whether threaded() waits for the workers before
    # returning is not visible in this file - confirm against thready.
    threaded(urls, scrape_cl_ad, num_threads=2)
    count = len(urls)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(str(count) + " ads processed from CL")
""" Extract information from a apartment rental ad's page. """
def scrape_cl_ad(url):
    """Extract the fields of one apartment rental ad and export them.

    Downloads ``url`` and builds a document with the keys ``source_url``,
    ``subject`` (text of ``<h2 class="postingtitle">``), ``body`` (text of
    ``<section id="postingbody">``), ``datetime`` (the ``datetime``
    attribute of the first ``<time>`` element) and ``post_id`` (the text of
    a ``<p>`` whose stripped text starts with ``"post id"``). The document
    is then passed to ``export_to_mongo``.

    Pages that lack any of these elements are skipped without exporting.

    :param url: URL of the ad page.
    :returns: ``None``.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content)

    title = soup.find('h2', {'class': 'postingtitle'})
    body = soup.find('section', {'id': 'postingbody'})
    posted = soup.find('time')
    # find() returns None when the page lacks the element; there is
    # nothing usable to store in that case.
    if title is None or body is None or posted is None:
        return
    posted_at = posted.attrs.get('datetime')
    if posted_at is None:
        return

    data = {
        'source_url': url,
        'subject': title.text.strip(),
        'body': body.text.strip(),
        'datetime': posted_at,
    }
    for meat in soup.find_all('p'):
        text = meat.text.strip()
        if text.startswith("post id"):
            data['post_id'] = text

    # export_to_mongo() reads data['post_id'] for its duplicate check, so a
    # document without one would raise KeyError there.
    if 'post_id' in data:
        export_to_mongo(data)
"""Export Document to Local Mongo Database"""
def export_to_mongo(data):
    """Insert an ad document into the local MongoDB unless already stored.

    Connects to MongoDB at ``localhost:27017`` and uses collection
    ``coll07292014`` of database ``CL_Ads_url``. The document is inserted
    only when no stored document has the same ``post_id``. The connection
    is closed before returning.

    :param data: Ad document; must contain a ``'post_id'`` key.
    :raises KeyError: if ``data`` has no ``'post_id'`` entry.
    """
    # Read the key first so that a malformed document fails before a
    # connection is opened.
    id_cache = data['post_id']
    client = pymongo.MongoClient('localhost', 27017)
    try:
        collection = client.CL_Ads_url.coll07292014
        # Query for this one id rather than fetching every distinct
        # post_id in the collection for each insert.
        # NOTE(review): check-then-insert is not atomic; if several threads
        # can export the same ad, a unique index on post_id would be needed.
        if collection.find_one({'post_id': id_cache}) is None:
            # NOTE(review): insert() is kept from the original code; newer
            # PyMongo releases replace it with insert_one() - confirm the
            # installed version before switching.
            collection.insert(data)
    finally:
        # Release the connection opened for this call.
        client.close()
if __name__ == '__main__':
    # Run the scraper only when this file is executed as a script, not
    # when it is imported.
    scrape_cl_ads()