/
fetcher.py
92 lines (75 loc) · 2.85 KB
/
fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# coding: utf-8
from __future__ import absolute_import
import requests
from furl import furl
import os
import json
from common import Common
from redis import StrictRedis
# Module-level Redis connection (default localhost settings) shared by the spider.
redis = StrictRedis()
# Redis set of URLs queued for fetching (the crawl frontier).
FETCH_SET = 'to_fetch'
# Redis set of URLs already fetched (dedupe record).
SEEN_SET = 'fetched'
import logging
logging.basicConfig()
log = logging.getLogger(__name__)
# NOTE(review): DEBUG level is hard-wired at import time — consider making configurable.
log.setLevel(logging.DEBUG)
class Fetcher(Common):
    """Spider that drains a Redis queue of URLs, fetches each page, and
    stores results under a local database directory.

    Helper methods used below (preprocess_url, url_write, url_delete,
    extract_links, filename_for) are presumably inherited from Common —
    they are not defined in this file; verify against common.py.
    """
def __init__(self, url, database):
    """Create a fetcher rooted at *url*, storing output in *database*.

    The database directory is created if it does not exist.  A
    ``config.json`` file inside it records the base URL of the crawl;
    if the file already exists and names a different URL, an Exception
    is raised so two different crawls cannot share one directory.

    :param url: base URL of the site to crawl.
    :param database: path of the directory holding crawl output.
    :raises Exception: if *database* was initialised for another URL.
    """
    self.url = furl(url)
    self.database = database
    if not os.path.isdir(database):
        os.mkdir(database)
    config_filename = os.path.join(database, 'config.json')
    if os.path.isfile(config_filename):
        # Use a context manager so the handle is closed deterministically
        # (the original json.load(open(...)) leaked the file object).
        with open(config_filename) as fh:
            config = json.load(fh)
        if config['url'] != str(self.url):
            raise Exception("Wrong base URL for this database")
    else:
        # First run against this directory: record which site it belongs to.
        with open(config_filename, 'w') as fh:
            json.dump(dict(url=str(self.url)), fh)
def run(self):
    """Entry point: run the spider loop, logging start and finish."""
    log.debug("Starting spider")
    self.spider()
    log.debug("Finished spider")
def spider(self):
    """Pop URLs from FETCH_SET, fetch each one, persist the response, and
    queue any newly discovered links.

    Every successfully processed URL is added to SEEN_SET so it is never
    fetched twice.  If the loop is interrupted while a URL is in flight,
    the ``finally`` block re-queues that URL and deletes any partially
    written output so a later run can retry it cleanly.

    Fixes vs. the original:
    - removed the ``ipdb`` debugger hook (debug tooling left in
      production code; it also added a non-stdlib dependency);
    - ``redis.spop`` returns None once the set is empty, but the
      original wrapped that in ``furl(None)`` — an (empty, truthy) furl
      object — so the ``while url`` loop could spin forever.  The raw
      pop result is now checked before wrapping.
    """
    def pop_next():
        # Return the next queued URL as a furl, or None when the
        # frontier is empty — guaranteeing loop termination.
        raw = redis.spop(FETCH_SET)
        return furl(raw) if raw else None

    url = None
    try:
        url = pop_next()
        while url:
            url = self.preprocess_url(url)
            # Skip URLs that were rejected, normalised to empty, or
            # already fetched in a previous pass.
            if not url or not str(url) or redis.sismember(SEEN_SET, str(url)):
                log.debug('skipping: %s', url)
                url = pop_next()
                continue
            response = self.request(url)
            self.url_write(url, response)
            redis.sadd(SEEN_SET, str(url))
            try:
                # Harvest outbound links; queue only the ones not yet seen.
                urls = set([str(x) for x in self.extract_links(response) if str(x)])
                urls = [x for x in urls if not redis.sismember(SEEN_SET, x)]
                if len(urls):
                    redis.sadd(FETCH_SET, *urls)
                    # Record the discovered links alongside the page data.
                    with open(self.filename_for(url, ext='urls'), 'w') as fh:
                        fh.write("\n".join(urls) + "\n")
            except Exception as e:
                # Link extraction is best-effort: log and keep crawling.
                log.error("Failed to parse URLs from %s: %s" % (url, e))
            url = pop_next()
        url = None
    finally:
        if url:
            # Interrupted mid-URL: put it back on the frontier and drop
            # any partial output so the next run retries it from scratch.
            redis.sadd(FETCH_SET, str(url))
            self.url_delete(url)
def request(self, url):
    """GET *url* (without following redirects) and return a plain dict
    describing the response: its url, raw body bytes, HTTP status code,
    and headers.
    """
    log.debug('fetching: %s', url)
    response = requests.get(str(url), allow_redirects=False)
    return {
        'url': str(url),
        'data': response.content,
        'status_code': response.status_code,
        'headers': dict(response.headers),
    }