#!/usr/bin/env python2.6
# Brandon Thomas <echelon@gmail.com>
# http://possibilistic.org
# Web2Feed, a work in progress web scraping tool.
# http://github.com/echelon/web2feed
# Copyright 2010. BSD/MIT licensed.
# Utilizes GPL-licensed code, so combined distribution is GPL.

import os
import sys

# Not sure if this is a good idea...
sys.path.insert(0, os.path.join(os.path.dirname(
        os.path.abspath(__file__)), 'libs'))

from urlparse import urlparse, urljoin
import httplib
from BeautifulSoup import BeautifulSoup
import re

# For fixing what BeautifulSoup can't parse.
import html5lib
from html5lib import sanitizer, treebuilders

# For PageCache
import time
import hashlib
from datetime import datetime, timedelta

try:
    from StringIO import StringIO
except:
    from cStringIO import StringIO

import simplejson as json # req python2.5

from libs import iso8601
from libs.robotexclusionrulesparser import RobotExclusionRulesParser

import htmlentitydefs # for terminal output
import textwrap # for terminal output
from libs.html2text import html2text # XXX: GPL LICENSED!

from mapper import get_scraper
# ============ Library API ======================

# TODO/XXX: DEPRECATE web2feed()
def web2feed(uri, content=False, timeout=15, redirect_max=2, cache=False):
    """Deprecated API for web2feed. Content can be passed or downloaded
    by web2feed."""
    wf = Web2Feed(uri)
    if content:
        wf.set_contents(content)
    else:
        wf.set_caching(cache)
    return wf.get_feed()
class Web2Feed(object):
    """Web2Feed, an API for scraping websites for content."""

    def __init__(self, uri, contents=''):
        """Supply the URI and optionally prefetched content."""
        self.uri = fix_uri(uri)
        self.contents = contents
        self.uagent = 'web2feed.py (http://github.com/echelon/web2feed) -- ' \
                'Scraping content for the distributed web'
        self.robot_rules_str = ''

        # Robots.txt rules policy
        self.obey_rules = True
        self.allow_on_fail = True

        # Page caching rules (same for requested page and robots.txt)
        self.use_caching = False
        self.cache_timeout = timedelta(minutes=10)
        self.cache_dir = './cache'

        # If prefetched content not supplied, autofetch?
        self.do_auto_fetch = True
        self.do_auto_fetch_rules = True

        # Cached scraper, purged if contents set or fetched.
        self.scraper = None

    def set_contents(self, contents):
        """Supply externally-fetched content."""
        self.contents = contents
        self.do_auto_fetch = False
        self.do_auto_fetch_rules = False
        self.scraper = None

    def get_contents(self):
        return self.contents

    def set_uagent(self, uagent):
        """Set the user-agent for fetching content."""
        self.uagent = uagent

    def set_robot_rules(self, rules_str):
        """Supply the robots.txt file if externally-fetched."""
        self.robot_rules_str = rules_str
        self.do_auto_fetch_rules = False

    def get_robot_rules(self):
        return self.robot_rules_str

    def set_robot_rules_policy(self, obey_rules, allow_on_fail=True):
        """Set the robots.txt rules policy."""
        self.obey_rules = obey_rules
        self.allow_on_fail = allow_on_fail

    def set_caching(self, use_caching, cache_timeout=timedelta(minutes=10),
            cache_dir='./cache'):
        """Configure caching of fetched pages (and robots.txt)."""
        self.use_caching = use_caching
        self.cache_timeout = cache_timeout
        self.cache_dir = cache_dir
    def fetch(self, timeout=15, redirect_max=2):
        """Fetch the page with web2feed's libraries."""
        self.do_auto_fetch = False
        self.scraper = None

        # Robots.txt handling
        if self.obey_rules:
            if self.do_auto_fetch_rules:
                self.fetch_rules(timeout, redirect_max)
            if not self.robot_rules_str and not self.allow_on_fail:
                print "robots.txt could not be fetched, but policy " \
                        "indicates we must obtain it."
                return False
            if self.robot_rules_str and not \
                    robot_is_allowed(self.uri, self.uagent,
                            rules_str=self.robot_rules_str):
                print "robots.txt indicates we cannot download " \
                        "the resource at '%s'" % self.uri
                return False

        # TODO: Error handling
        contents = get_page(self.uri,
                timeout=timeout,
                redirect_max=redirect_max,
                caching=self.use_caching,
                cache_period=self.cache_timeout,
                cache_dir=self.cache_dir,
                uagent=self.uagent)
        self.contents = contents

    def fetch_rules(self, timeout=15, redirect_max=2):
        """Fetch the robots.txt with web2feed's libraries."""
        self.do_auto_fetch_rules = False
        robots_uri = urljoin(self.uri, '/robots.txt')

        # TODO: Error handling
        rules = get_page(robots_uri,
                timeout=timeout,
                redirect_max=redirect_max,
                caching=self.use_caching,
                cache_period=self.cache_timeout,
                cache_dir=self.cache_dir,
                uagent=self.uagent)
        self.robot_rules_str = rules

    def get_feed(self):
        """Scrape the page for semantic content."""
        def get_formatted(scraper):
            return {
                'meta': scraper.get_meta(),
                'feed': scraper.get_feed(),
            }

        if self.scraper:
            return get_formatted(self.scraper)

        if not self.contents and self.do_auto_fetch:
            print "Web2Feed.get_feed() Attempting to fetch..."
            self.fetch()

        if not self.contents:
            #raise Exception, "No content to scrape!"
            print "Web2Feed.get_feed() unable to parse: no contents!"
            return False

        # TODO: Cache the scraper.
        self.scraper = get_scraper(self.contents, self.uri)
        return get_formatted(self.scraper)
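
# Example use of the Web2Feed API (a minimal sketch; the URI is hypothetical
# and site support depends on the scrapers that mapper.get_scraper() knows
# about):
#
#   wf = Web2Feed('http://example.com/news')
#   wf.set_caching(True)                    # cache pages under ./cache
#   wf.set_robot_rules_policy(True, False)  # obey robots.txt; fail closed
#   result = wf.get_feed()                  # {'meta': {...}, 'feed': [...]}
#
# If the page was fetched elsewhere, hand web2feed the markup instead:
#
#   wf = Web2Feed('http://example.com/news')
#   wf.set_contents(prefetched_html)
#   result = wf.get_feed()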
# ============ Run from terminal ================

def main():
    wf = Web2Feed(sys.argv[1])
    wf.set_caching(True)
    wf.set_robot_rules_policy(True, False)
    print wf.get_feed()
    sys.exit()

    # Unreachable debug code left over from development.
    uri = sys.argv[1]
    content = get_page(uri)
    print "Content len: %d" % (0 if not content else len(content))
    sc = get_scraper(content, uri)
    #print sc.get_feed()
    print sc.get_plaintext(80)
    #map_domain_to_module(domain)
# ============ Downloading / Caching ============

def get_page(uri, timeout=10, redirect_max=2, caching=True, uagent=None,
        cache_period=timedelta(minutes=10),
        cache_dir='./cache', cache_ext='.html'):
    """Get the page from online or the cache."""
    # TODO: KeepAlive connections to host for batch fetching
    # of multiple pages or media

    # ======== Nested Definitions ===============

    def download(uri, timeout=10, redirect_max=2, redirect_cnt=0, uagent=None):
        """Download the contents at the URI specified."""
        print "Downloading %s ..." % uri
        u = urlparse(uri)
        c = None
        if sys.version_info >= (2, 6):
            c = httplib.HTTPConnection(u.netloc, timeout=timeout)
        else:
            c = httplib.HTTPConnection(u.netloc) # timeout added in 2.6

        if uagent is None:
            uagent = 'web2feed.py (http://github.com/echelon/web2feed) ' \
                    '-- Scraping content for the distributed web'
        h = {'User-Agent': uagent}

        c.request('GET', u.path, headers=h)
        resp = c.getresponse()

        if (300 <= resp.status < 400):
            if redirect_cnt > redirect_max:
                raise Exception, "Too many redirects."
            # TODO: Handle relative locations.
            newloc = resp.getheader('location')
            return download(newloc, timeout, redirect_max, redirect_cnt+1,
                    uagent)
        else:
            return resp.read()

    class PageCache(object):
        """Caches pages so it's faster to develop scraping logic for
        different websites. This feature isn't necessary in
        production."""

        PREFIX = './cache'
        SUFFIX = '.html'

        def __init__(self, uri):
            self.uri = uri

        def filename(self):
            """Get the cache filename."""
            hname = hashlib.md5(self.uri).hexdigest()
            return self.PREFIX + '/' + hname + self.SUFFIX

        def exists(self):
            """Does the cache file exist?"""
            return os.path.exists(self.filename())

        def expired(self, td=timedelta(minutes=10)):
            """Has the cache file gone stale?"""
            try:
                ts = os.path.getmtime(self.filename())
                time = datetime.fromtimestamp(ts)
                return datetime.today() > time + td
            except:
                return True

        def read(self):
            """Read from the cache file."""
            f = open(self.filename(), 'r')
            c = f.read()
            f.close()
            return c

        def write(self, contents):
            """Save to the cache file."""
            f = open(self.filename(), 'w')
            f.write(contents)
            f.close()

    # ======== Function Body ====================

    if not caching:
        try:
            return download(uri, timeout, redirect_max, uagent=uagent)
        except:
            print "Download failed."
            return False

    PageCache.PREFIX = cache_dir
    PageCache.SUFFIX = cache_ext
    cache = PageCache(uri)

    if cache.exists():
        if cache.expired(cache_period):
            try:
                content = download(uri, timeout, redirect_max, uagent=uagent)
                cache.write(content)
                return content
            except:
                print "Download failed."
        return cache.read()
    else:
        content = download(uri, timeout, redirect_max, uagent=uagent)
        cache.write(content)
        return content
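
# get_page() can also be used on its own; a sketch (the URI and cache
# directory here are only placeholders):
#
#   html = get_page('http://example.com/', caching=True,
#           cache_period=timedelta(minutes=30), cache_dir='/tmp/w2f-cache')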
# ============ Robots.txt Rule Checking =========

def robot_is_allowed(uri, uagent, allow_on_fail=True, rules_str='', cache=True,
        cache_period=timedelta(minutes=10)):
    """Check if the robot is allowed to fetch the desired URI.
    The URI must be absolute.
    The allow_on_fail policy decides the result when robots.txt cannot
    be fetched. An already-fetched rules string may be supplied; if
    web2feed downloads robots.txt itself, the file can be cached."""
    if not rules_str:
        robots_uri = urljoin(uri, '/robots.txt')
        # TODO: better check.
        if robots_uri == '/robots.txt':
            print 'Non-absolute uri for robot_is_allowed: %s' % uri
            return allow_on_fail
        print 'ROBOT URI: %s' % robots_uri
        rules_str = get_page(robots_uri, caching=cache,
                cache_period=cache_period)

    if not rules_str:
        return allow_on_fail

    rparse = RobotExclusionRulesParser()
    rparse.parse(rules_str)
    return rparse.is_allowed(uagent, uri)
# ============ Scraping Content =================

class Scraper(object):
    """Scrape the content off a page."""

    def __init__(self, contents, uri):
        # Web location page was downloaded from (if known)
        self.uri = uri

        # HTML DOM Tree.
        # Only used in parsing.
        self.soup = self._parse(contents)

        # Meta information such as <title> and description
        self.meta = {}

        # Parsed out feed content
        # Used for any presentation.
        # TODO: Do cleaning, etc. on this.
        self.feed = self._extract_feed()

        # TODO: Bad form
        self._guess_format()
        self._add_markdown()

    def _guess_format(self):
        """Guess the format of the data returned."""
        if not self.feed:
            return # nothing extracted (base class returns None)
        for i in range(len(self.feed)):
            x = self.feed[i]
            guess_contents = 'contents_format' not in x
            guess_summary = 'summary_format' not in x
            if 'contents' in x and guess_contents:
                if re.search(r'<\w+>', x['contents']):
                    x['contents_format'] = 'text/html'
                else:
                    x['contents_format'] = 'text/plain'
            if 'summary' in x and guess_summary:
                if re.search(r'<\w+>', x['summary']):
                    x['summary_format'] = 'text/html'
                else:
                    x['summary_format'] = 'text/plain'

    def _add_markdown(self):
        """Add markdown version of the content."""
        if not self.feed:
            return # nothing extracted (base class returns None)
        for i in range(len(self.feed)):
            x = self.feed[i]
            if 'contents' in x:
                x['contents_markdown'] = html2text(x['contents'], self.uri)
            if 'summary' in x:
                x['summary_markdown'] = html2text(x['summary'], self.uri)

    def get_feed(self):
        """Get the feed."""
        return self.feed

    def get_meta(self):
        """Get the meta info."""
        return self.meta

    def get_json(self):
        """Get the feed serialized in JSON."""
        # Properly handle datetime objects
        dthandle = lambda o: o.isoformat() if isinstance(o, datetime) else None
        return json.dumps(self.feed, default=dthandle)
    def get_plaintext(self, wrap=80):
        """Output a feed to the terminal. This can be used for
        debugging or for reading/wasting time."""
        # TODO: multiple feed types.

        def format(title, date, author, contents, wrap):
            sz = 0
            lines = []
            if date and author:
                title += ' (by %s on %s)' % (author, date)
            elif date or author:
                title += ' (%s)' % (('on ' + date) if date else ('by ' + author))
            lines += textwrap.wrap(title, wrap)
            for ln in lines:
                s = len(ln)
                if s > sz:
                    sz = s
            lines.append('-'*sz)
            lines += textwrap.wrap(contents, wrap)
            return '\n'.join(lines)

        def contents_or_summary(story):
            """Get the contents or summary."""
            contents = ''
            if 'contents' in story:
                contents = story['contents']
            elif 'summary' in story:
                contents = story['summary']
            return contents

        feed = self.feed
        if type(feed) is dict:
            feed = [feed,]

        ret = ''
        for story in feed:
            cts = contents_or_summary(story)
            cts = ''.join(BeautifulSoup(cts).findAll(text=True)) # html->txt
            cts = entity_unescape(cts)
            title = entity_unescape(story['title'])
            date = ''
            if 'date' in story and story['date']:
                date = story['date'].strftime('%B %d, %H:%M')
            author = ''
            if 'author' in story and story['author']:
                author = story['author']
            ret += format(title, date, author, cts, wrap)
            ret += "\n\n"
        return ret
    def _extract_feed(self):
        """Overload for parsing out the feed contents."""
        return None

    @staticmethod
    def _parse(content):
        """Parse HTML content with BeautifulSoup."""
        try:
            return BeautifulSoup(content)
        except:
            print "HTMLParser error. Trying html5lib."
            content = Scraper._parser_fallback(content)
            # Debug: dump the re-serialized markup to disk.
            f = open('out', 'w')
            f.write(content)
            f.close()
            return BeautifulSoup(content)
            #self.soup = content

    @staticmethod
    def _parser_fallback(content):
        """If BeautifulSoup's parser fails, re-parse with html5lib."""
        parser = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder('beautifulsoup'))
        soup = parser.parse(StringIO(content))
        return soup.prettify()
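
# Site-specific scrapers (looked up via mapper.get_scraper) subclass Scraper
# and override _extract_feed() to return a list of dicts. The keys consumed
# elsewhere in this module are 'title', 'date', 'author', 'contents' and
# 'summary'. A minimal sketch (the class name and CSS class are hypothetical;
# registration is left to mapper.py):
#
#   class ExampleComScraper(Scraper):
#       def _extract_feed(self):
#           feed = []
#           for div in self.soup.findAll('div', {'class': 'story'}):
#               title = div.find('h2')
#               body = div.find('p')
#               feed.append({
#                   'title': title.string if title else '',
#                   'summary': unicode(body) if body else '',
#               })
#           return feed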
# ============ String Helper Funcs ==============

def fix_uri(uri):
    """Fix a partial URI (eg. no scheme, etc.)"""
    u = urlparse(uri)
    if not u.scheme:
        u = urlparse(urljoin('http://', '//' + u.geturl()))
    return u.geturl()

def parse_iso_date(date_str):
    """Wrapper around iso8601 library."""
    return iso8601.parse_date(date_str)

def remove_ordinal(date_str):
    """Remove ordinal that datetime library can't parse."""
    return re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)

def entity_unescape(text):
    """Turn htmlentities into unicode.
    From http://effbot.org/zone/re-sub.htm#unescape-html"""
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text # leave as is
    return re.sub(r"&#?\w+;", fixup, text)
if __name__ == '__main__':
    main()