-
Notifications
You must be signed in to change notification settings - Fork 0
/
douban.py
executable file
·116 lines (95 loc) · 4.02 KB
/
douban.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python2
#coding:utf8
from __future__ import print_function
import papa
import json
import datetime
import time
import re
from lxml import etree
from io import StringIO
class Douban():
    """Crawler definition for Douban book pages and tag listing pages.

    The class exposes two generators, ``gen_seeds`` and ``parse``, that yield
    action tuples.  The action vocabulary used here is ``('sleep', seconds)``,
    ``('forget',)``, ``('dynamic', url, n)``, ``('static', url, n)`` and
    ``('save', key, json_text)``.  How each action is executed is decided by
    the consumer of these generators (presumably the ``papa`` framework this
    file hands the instance to -- confirm against its documentation).

    This module targets Python 2: URLs and titles are stored as UTF-8 encoded
    byte strings before being serialised with ``json.dumps``.
    """

    # Not read anywhere inside this class; presumably consumed by the
    # framework that drives the crawler -- TODO confirm.
    fetch_timeout = 5

    # Seconds to pause at the start of every parse() call.  Set to 0 to
    # disable the pause (e.g. in tests).
    parse_delay = 5

    # File that receives a dump of the page currently being parsed (debugging
    # aid, overwritten on every call).  Set to None or '' to disable the dump.
    debug_dump_path = 'tmp.html'

    # On a book page: anchors whose href ends in "focus=book" (tag links).
    # Groups: (href, anchor text).
    tag_from_book_anchor = re.compile(
        u'<a[^>]*href="([^"]*focus=book)">([^<]*)<')

    # On a book page: anchors pointing at another book subject page.
    # Groups: (href, anchor text).
    book_anchor = re.compile(
        u'<a[^>]*href="(http://book.douban.com/subject/[0-9]*/)"[^>]*>([^<]*)<',
        re.S)

    # On a tag page: anchors pointing at a book subject page.  Unlike
    # book_anchor, the href is not required to end right after the subject id,
    # so anything up to the closing '>' is skipped.
    # Groups: (subject url, anchor text).
    book_from_tag_anchor = re.compile(
        u'<a[^>]*href="(http://book.douban.com/subject/[0-9]*/)[^>]*>([^<]*)<',
        re.S)

    # On a tag page: pagination anchors of the form href="?start=N".
    # Group: the query string including the leading '?'.
    tag_from_tag_anchor = re.compile(u'<a href="(\\?start=[0-9]*)"')

    def gen_seeds(self):
        """Yield the initial actions for one crawl round.

        Yields, in order: a 300 second rest, a request to forget all the
        dynamic entries, and a single book page as the 'dynamic' seed.
        Other seeds tried during development were the douban.com front page
        and a tag listing page.
        """
        yield ('sleep', 300)   # rest for 300 seconds
        yield ('forget',)      # forget all the dynamic
        yield ('dynamic', 'http://book.douban.com/subject/4105745/', 1)

    def parse(self, url, content, tree):
        """Extract follow-up links and records from one fetched page.

        Args:
            url: address of the fetched page; a non-``str`` value (unicode on
                Python 2) is UTF-8 encoded before use.
            content: page source as text; it is matched against the unicode
                patterns defined on the class.
            tree: accepted for interface compatibility but not used; book
                pages are re-parsed from ``content``.

        Yields:
            ``('save', key, json_text)`` and ``('static', url, 2)`` tuples.
            Pages whose URL contains ``'subject'`` are handled as book pages;
            otherwise pages whose URL contains ``'tag'`` are handled as tag
            pages; any other page yields nothing.

        Side effects:
            Sleeps ``parse_delay`` seconds, prints ``url`` and, when
            ``debug_dump_path`` is set, overwrites that file with the page.
        """
        if self.parse_delay:
            time.sleep(self.parse_delay)
        print(url)

        root = url
        if not isinstance(root, str):
            root = root.encode('utf8')

        if self.debug_dump_path:
            # 'with' guarantees the handle is closed even if encoding or
            # writing fails.
            with open(self.debug_dump_path, 'w') as dump:
                print(content.encode('utf8'), file=dump)

        if 'subject' in root:
            for action in self._parse_book_page(root, content):
                yield action
        elif 'tag' in root:
            for action in self._parse_tag_page(root, content):
                yield action

    def _parse_book_page(self, root, content):
        """Yield actions for a book page: linked books, tags, then the book itself."""
        # Links to other books; anchors with blank text are skipped.
        for link, title in Douban.book_anchor.findall(content):
            title = title.strip()
            if not title:
                continue
            record = {'url': link.encode('utf8'),
                      'title': title.encode('utf8')}
            yield ('save', record['url'],
                   json.dumps(record, ensure_ascii=False))
            yield ('static', link, 2)

        # Tag links: drop the query string and append 'book' to the path.
        for link, title in Douban.tag_from_book_anchor.findall(content):
            link = link.partition('?')[0] + 'book'
            record = {'url': link.encode('utf8'),
                      'title': title.encode('utf8')}
            yield ('save', record['url'],
                   json.dumps(record, ensure_ascii=False))
            yield ('static', link, 2)

        # The book's own record: the first <div id="info"> with spaces and
        # newlines removed, plus the text of span[property="v:itemreviewed"].
        doc = etree.HTML(content)
        if doc is None:
            # NOTE(review): guards against etree.HTML producing no root
            # element for content without parsable markup -- confirm.
            return
        bookinfo = {'url': root}
        for node in doc.xpath('//div[@id="info"]'):
            info = etree.tostring(node, encoding='utf8')
            bookinfo['info'] = info.replace(' ', '').replace('\n', '')
            break
        for node in doc.xpath('//span[@property="v:itemreviewed"]'):
            if node.text:
                print(node.text)
                bookinfo['title'] = node.text.encode('utf8')

        # Only save the record when both the info block and the title exist.
        if 'info' in bookinfo and 'title' in bookinfo:
            yield ('save', 'parsed:' + bookinfo['url'],
                   json.dumps(bookinfo, ensure_ascii=False))

    def _parse_tag_page(self, root, content):
        """Yield actions for a tag page: pagination links, books, then the tag summary."""
        pagination = Douban.tag_from_tag_anchor.findall(content)
        if pagination:
            # Both values are the same for every pagination link on the page.
            base = root.partition('?')[0]
            tag_name = root.split('/')[-2]
            for query in pagination:
                page_url = base + query.encode('utf8')
                record = {'url': page_url, 'title': tag_name}
                yield ('save', record['url'],
                       json.dumps(record, ensure_ascii=False))
                yield ('static', page_url, 2)

        related = []
        for link, title in Douban.book_from_tag_anchor.findall(content):
            title = title.strip()
            if not title:
                continue
            record = {'url': link.encode('utf8'),
                      'title': title.encode('utf8')}
            related.append([record['url'], record['title']])
            yield ('save', record['url'],
                   json.dumps(record, ensure_ascii=False))
            yield ('static', link, 2)

        # Summary record listing every book found on this tag page.
        if related:
            summary = {'url': root, 'related': related}
            yield ('save', 'parsed:' + summary['url'],
                   json.dumps(summary, ensure_ascii=False))
# Script entry point: when this file is executed directly, hand a Douban
# instance to papa.quickstart under the name 'douban'.
if __name__ == "__main__":
    crawler = Douban()
    papa.quickstart(crawler, 'douban')