#!/usr/bin/python
# -*- coding: utf-8 -*-
import logging
import logging.config
import time
import hashlib
import re
import json
import urllib
import urllib2
import random
import sys
import base64
import jieba
from flask import Flask, request
from flask_restful import Resource, Api
from bs4 import BeautifulSoup
import requests
from pyquery import PyQuery as pq
from db import MongoDBPipeline
from test import DB
from test import skip
reload(sys)
sys.setdefaultencoding('utf8')
UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
class spider(object):
    """Douban group discussion spider.

    Crawls the paginated discussion list, extracts one record per topic row,
    fetches each topic's body, segments it with jieba (minus stop words from
    `skip`) and stores the records through MongoDBPipeline.
    """

    def pa(self, url, start, limit):
        """Crawl list pages for offsets `start`..`limit` (Douban pages by 25).

        url   -- list URL ending in 'start=' (any existing offset is stripped).
        start -- first page offset, inclusive.
        limit -- last page offset, inclusive.
        """
        md = MongoDBPipeline(DB['db'], DB['col'], DB['address'])
        # Compute the offset-less base once instead of re-splitting the
        # mutated URL on every iteration as the original did.
        base = url.split('=').pop(0) + '='
        # One session for the whole crawl: connection reuse + sticky headers.
        s = requests.session()
        s.headers.update({"User-Agent": UA})
        while start <= limit:
            page_url = base + str(start)
            start += 25  # Douban shows 25 topics per list page
            print(page_url)
            r = s.get(page_url, timeout=(2, 4))
            d = pq(r.text)
            rows = d.find('.olt').find('tr')
            items = []
            for row in rows:
                it = pq(row)
                clazz = it.attr('class')
                if clazz is None:  # header <tr> has no class attribute
                    print(it)
                    continue
                if clazz != "":    # topic rows carry class="" exactly
                    continue
                cells = it.children()
                item = {}
                for j in range(0, 4):
                    cell = pq(cells[j])
                    if j == 0:
                        item['title'] = cell.find('a').attr('title')
                        item['href'] = cell.find('a').attr('href')
                    elif j == 1:
                        item['author'] = cell.text()
                        item['author_link'] = cell.find('a').attr('href')
                    elif j == 2:
                        item['reply_count'] = cell.text()
                    else:
                        item['last_reply'] = cell.text()
                # Dedup key: computed ONCE after the href is known. The
                # original recomputed and re-checked it on every cell
                # iteration (and before 'href' existed on the first cell).
                if item.get('href') is not None:
                    item['md5'] = hashlib.new("md5", item['href']).hexdigest()
                    if md.find({'md5': item['md5']}) is not None:
                        item['md5'] = ''  # already in DB: mark as skipped
                if item.get('md5'):  # keep only new, hashable records
                    items.append(item)
            print(len(items))
            if len(items) > 0:
                self.loop_article(items, s, pq)
                times = 0
                for i in items:
                    times += 1
                    md.save(i)
                print('saved ' + str(times) + ' records')
            time.sleep(5)  # be polite between list-page requests

    def loop_article(self, items, s, pq):
        """Fetch every item's article body and attach item['sub_list'].

        sub_list is the jieba search-mode token list with single spaces and
        the configured stop words (`skip`) removed. Returns `items`.
        """
        print('coming in to fetch_article..')
        if items is None:
            return items
        for item in items:
            href = item.get('href')
            content = self.fetch_article(href, s, pq)
            if content is None:
                item['sub_list'] = []
                print('href ' + href + ' content is none')
                continue
            tokens = [t for t in jieba.cut_for_search(content) if t != ' ']
            if len(tokens) > 0:
                # Set difference drops the stop words (order not preserved,
                # matching the original behavior).
                tokens = list(set(tokens).difference(set(skip)))
            item['sub_list'] = tokens
        return items

    def fetch_article(self, href, s, pq):
        """GET one topic page; return the HTML of its body paragraph block.

        Returns None when the '.topic-content p' selector matches nothing.
        """
        print(href)
        r = s.get(href, timeout=(4, 8))
        d = pq(r.text)
        content = d.find('.topic-content').find('p').html()
        time.sleep(2)  # throttle between article fetches
        return content
if __name__ == '__main__':
    # Keep the instance under its own name instead of rebinding (shadowing)
    # the `spider` class, as the original `spider = spider()` did.
    crawler = spider()
    crawler.pa('https://www.douban.com/group/fangzi/discussion?start=', 0, 500)