# coding: utf-8
"""
Crawl the IDs (thread IDs) of discussion posts on the Tianya forum.
NOTE: a forum "thread" here is not to be confused with an OS thread.
Author: kqingchao@gmail.com
Date: 2014.5.12
"""
from urlparse import urljoin, urlparse
from collections import deque
import logging
import time
import codecs  # open files with an explicit encoding
import os
from bs4 import BeautifulSoup
from webPage import WebPage
from threadPool import ThreadPool
from patterns import *
from logconfig import congifLogger
import stacktracer
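
# NOTE: regex_post_first and regex_next_page are star-imported from the
# external `patterns` module, which is not included in this file. For
# reference, a plausible pair of definitions matching Tianya's URL scheme
# would look like the following (an assumption for illustration, not the
# actual module contents):
#
#   import re
#   # post pages: http://bbs.tianya.cn/post-<section>-<post_id>-1.shtml
#   regex_post_first = re.compile(
#       r'http://bbs\.tianya\.cn/post-\w+-(?P<post_id>\d+)-1\.shtml')
#   # list pages:  http://bbs.tianya.cn/list-<section>-<page>.shtml
#   regex_next_page = re.compile(
#       r'http://bbs\.tianya\.cn/list-\w+-\d+\.shtml')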
log = logging.getLogger('Main.ThreadIDCrawler')
class PostIDCrawler(object):
    def __init__(self, start_url, thread_num, post_list_path, max_post_num=1000):
        """
        `start_url` the URL to start crawling from
        `thread_num` number of crawler threads to use
        `post_list_path` path of the file that stores the crawled post IDs
        `max_post_num` maximum number of post IDs to crawl
        """
        # thread pool with the given number of worker threads
        self.thread_pool = ThreadPool(thread_num)
        # thread that saves the crawled post IDs
        # NOTE: only one saving thread is allowed here, because all of the
        # saving tasks write to the same file
        self.save_thread = ThreadPool(1)
        # path of the file that stores the post ID list
        self.post_list_path = post_list_path
        # pages that have already been visited (or queued for visiting)
        self.visited_href = set()
        # discussion-list pages that are still waiting to be visited
        self.unvisited_href = deque()
        # pages that failed to be fetched
        self.failed_href = set()
        self.start_url = start_url
        # Crawling ends in one of two ways: 1) the maximum number of posts
        # has been reached; 2) all posts have been crawled.
        # Only the post IDs (thread IDs) are stored.
        self.post_list = list()
        self.is_crawling = False
        # maximum number of post IDs to crawl
        self.MAX_POST_NUM = max_post_num
        #self.MAX_POST_NUM = float('inf')
        # Maximum number of posts shown per list page; a page does not
        # always seem to show exactly 25 posts.
        #self.MAX_TOPICS_PER_PAGE = 25

    def start(self):
        print '\nStart crawling post id list...\n'
        self.is_crawling = True
        self.thread_pool.startThreads()
        self.save_thread.startThreads()
        # open the output file
        self.post_list_file = codecs.open(self.post_list_path, 'w', 'utf-8')
        print "Add start url:", self.start_url
        self.unvisited_href.append(self.start_url)
        # Assign tasks: the thread pool downloads the queued pages
        # concurrently (this call does not block).
        self._assignInitTask()
        # Wait until the thread pool has finished all of its tasks.
        # self.thread_pool.taskJoin() would do the same, but then the
        # process could not be interrupted with Ctrl-C.
        while self.thread_pool.getTaskLeft() > 0:
            #print "Task left: ", self.thread_pool.getTaskLeft()
            # check whether enough post IDs have been crawled
            if len(self.post_list) >= self.MAX_POST_NUM:
                print 'Reached the maximum number of posts; stopping the crawl.'
                break
            else:
                print 'Number of posts crawled so far:', len(self.post_list)
            time.sleep(3)
        # wait for the saving thread to store the remaining results
        while self.save_thread.getTaskLeft() > 0:
            print 'Waiting for saving thread. Tasks left: %d' % self.save_thread.getTaskLeft()
            time.sleep(3)
        log.info("Thread ID list crawling done.")
        self.stop()
        # There may still be pending tasks at this point, but enough posts
        # have already been crawled.
        #assert(self.thread_pool.getTaskLeft() == 0)
        # close the output file
        self.post_list_file.close()
        print "Main Crawling procedure finished!"

def stop(self):
self.is_crawling = False
self.thread_pool.stopThreads()
self.save_thread.stopThreads()

    def _assignInitTask(self):
        """Hand every queued URL to the thread pool for fetching.
        """
        while len(self.unvisited_href) > 0:
            # take one URL from the unvisited queue and assign it to a worker
            url = self.unvisited_href.popleft()
            self.thread_pool.putTask(self._taskHandler, url)
            # mark the URL as visited
            self.visited_href.add(url)

    def _taskHandler(self, url):
        """Fetch the page at `url`, extract the post IDs on it, and queue
        the link to the next list page.
        """
        print "Visiting : " + url
        webPage = WebPage(url)
        # fetch the page content
        flag = webPage.fetch()
        if flag:
            url, pageSource = webPage.getDatas()
            hrefs = self._getAllHrefsFromPage(url, pageSource)
            # collect the valid links
            post_list = []
            for href in hrefs:
                # only links that match the post URL format are processed
                m = regex_post_first.match(href)
                if self._isHttpOrHttpsProtocol(href) and m is not None:
                    post_list.append(m.group('post_id'))
                # look for the "next page" link in the current page
                m = regex_next_page.match(href)
                if m is not None and m.group() not in self.visited_href:
                    next_url = m.group()
                    print 'Add next page link: ', next_url
                    self.thread_pool.putTask(self._taskHandler, next_url)
                    self.visited_href.add(next_url)
            for post_id in post_list:
                #print "Add thread link: ", post_id
                self.post_list.append(post_id)
            # hand the post IDs crawled from this page to the saving thread
            self.save_thread.putTask(self._saveTopicHandler, post_list)
            return True
        else:
            # the page could not be fetched
            log.error(u"Failed to fetch a post list page. URL: %s" % url)
            self.failed_href.add(url)
            return False

    def _saveTopicHandler(self, post_list):
        """Append the post IDs extracted from one page to the output file.
        NOTE: there is only one saving thread, so there are no concurrent
        writes to the file.
        """
        for tid in post_list:
            self.post_list_file.write(tid + '\n')
        self.post_list_file.flush()
        # os.fsync() expects a file descriptor, not a file object
        os.fsync(self.post_list_file.fileno())

    def _getAllHrefsFromPage(self, url, pageSource):
        """Parse the HTML source and return a list of all links on the page."""
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=True)
        for a in results:
            # The link must be encoded as utf-8: links containing Chinese
            # characters, e.g. http://aa.com/文件.pdf, are not URL-encoded
            # automatically by bs4, which would otherwise lead to an
            # encoding exception later on.
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)  # resolve relative links
            hrefs.append(href)
        return hrefs
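
    # The same extraction could be done with XPath via lxml instead of
    # BeautifulSoup. A minimal untested sketch, shown only as an
    # alternative (it assumes lxml is installed):
    #
    #   from lxml import etree
    #   def _getAllHrefsFromPage(self, url, pageSource):
    #       tree = etree.HTML(pageSource)
    #       return [urljoin(url, h.encode('utf8'))
    #               for h in tree.xpath('//a/@href')]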

    def _isHttpOrHttpsProtocol(self, href):
        protocol = urlparse(href).scheme
        return protocol == 'http' or protocol == 'https'

    def _getAlreadyVisitedNum(self):
        # visited_href holds the links already handed to the task queue,
        # some of which may still be in flight. The true number of visited
        # links is therefore len(visited_href) minus the number of
        # remaining tasks.
        if len(self.visited_href) == 0:
            return 0
        else:
            return len(self.visited_href) - self.thread_pool.getTaskLeft()
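
# The helper modules webPage, threadPool, logconfig and stacktracer are not
# part of this file. Based on how they are used above, the crawler relies
# only on a small interface; an assumed sketch (not the actual
# implementations) is:
#
#   class WebPage(object):
#       def __init__(self, url): ...
#       def fetch(self):      # download the page; returns True on success
#       def getDatas(self):   # returns (url, page_source)
#
#   class ThreadPool(object):
#       def __init__(self, thread_num): ...
#       def startThreads(self): ...
#       def stopThreads(self): ...
#       def putTask(self, func, *args): ...
#       def getTaskLeft(self):  # number of queued plus in-flight tasks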

if __name__ == "__main__":
    stacktracer.trace_start("trace.html", interval=5, auto=True)  # auto=True keeps the trace file up to date
    congifLogger("log/thread-id-crawler.log", 5)
    # Crawl starting from this URL. The address of the next page is
    # extracted from each page, so only a single process can be used.
    section_id = 'free'
    start_url = 'http://bbs.tianya.cn/list-%s-1.shtml' % (section_id)
    print "Start URL:", start_url
    post_id_list_path = 'post-id-list.txt'
    tcrawler = PostIDCrawler(start_url, 1, post_id_list_path, max_post_num=10000)
    tcrawler.start()
    stacktracer.trace_stop()