/
spider.py
executable file
·158 lines (137 loc) · 5.1 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/python
#-*- coding:utf8-*-
#"for get all the html"
import BeautifulSoup,re,Queue
import urllib,urllib2
import optparse
import threading
import logging2
import threading2
import time
import doctest
import sqlite3
#I think i will get all the options and args
parser = optparse.OptionParser()
parser.add_option('-u','--urls', dest = "urls",help = "the urls ",metavar = "URLS")
parser.add_option('-d','--deep',type = 'int', dest = "deep",help = "the deep", metavar = "DEEP",default = 0)
parser.add_option('-f' , '--file',type = 'string' , dest = 'logfile', help = "the file is to record the logs", metavar = "LOGFILE",default = 'spider.log')
parser.add_option('-l','--loglevel',type = 'int' ,dest = 'loglevel',help = 'for loglevel(1-5),the number is big,the level is heigh',metavar = 'LOGLEVEL(1-5)',default = 1)
parser.add_option('--testself',dest = 'test',action = 'store_true',help = 'self test or test myself')
parser.add_option('-t','--thread',type = 'int',dest = 'number' ,help = 'the threadnumber',metavar = 'NUMBER',default =10)
parser.add_option('--dbfile',dest = 'dbfile' ,help = 'the db file path',metavar = 'DBFILE',default ='test.db')
parser.add_option('--key',dest = 'key' ,help = 'the key is here,if you want a chinese,you must be utf-8 in your system,or change the code.',metavar = 'KEY',default ='.*')
options,args = parser.parse_args()
print "all the optins",options
#workQueue is ready to do.resultQueue is having done.alldone is finish task
workQueue = Queue.Queue()
resultQueue = Queue.Queue()
alldoneQueue = Queue.Queue()
#logging2 初始化,以后可以使用logging2来插入log
logging2.init(options.loglevel,options.logfile)
#decode the key
keyinsys = options.key.decode('utf-8')
#sqlite3 here
#conn = sqlite3.connect(options.dbfile)
#cor = conn.cursor()
#cor.execute('create table if not exists keyofhtml( id integer primary key,urls text,key text,htmls text)')
def analyseurl(urls):
"""
功能:分析urls,返回列表格式的字典
字典格式:{'name':names,'urls':url}
这里将符合要求的页面信息插入数据库,还包括日志信息
还包括 key的判断????
mm = re.compile('''\<a.*?href\=['|"](http\w*?)['|"].*?\>''')
"""
returns=[]
html = urllib2.urlopen(urls,timeout=50)
#print urls
#try:
if True:
data = html.read()
#soup = BeautifulSoup.BeautifulSoup(data)
#temp = soup.findAll('a',href=re.compile(r'http.*?\W'))#为什么不直接用re匹配a标签,使用beautifulsoup只能匹配出15个,怎么回事呢
mm = re.compile('''\<a\W*?href\="(http.*?)".*?\>''')
temp = mm.findall(data)
logging2.debug('analysing '+urls)
#print 'analysing'
for tt in temp:
returns.append(tt)
conn = sqlite3.connect(options.dbfile)
cor = conn.cursor()
cor.execute('create table if not exists keyofhtml( id integer primary key,urls text,key text,htmls text)')
#print 0,'0'
rr = re.compile(r"""content\W*?\=\W*?["|']\W*?text\/html\W*?\;\W*?charset\W*?\=\W*?(\w*?)\W*?["|']""")
m = rr.search(data)
#print 1,'1'
if m:
#print 2
code = m.group(1)
try:
data = data.decode(code)
except UnicodeDecodeError,e:
#print e
logging2.error('decode from charset error')
#print 4
rekey = re.compile(keyinsys)#生成关键字匹配
good = rekey.search(data)
if good:
#print 'good'
data = data.replace("'",'"')#纠结的单引号怎么处理?
sqls = "insert into keyofhtml(urls,key,htmls) values('%s','%s','%s')"
try:
cor.execute(sqls%(urls,keyinsys,data))
except UnicodeDecodeError,e:
#print e
cor.execute(sqls%(urls,keyinsys,'decode error'))
logging2.error('reading '+urls+' decode error')
conn.commit()
#print 'donessss'
conn.close()
logging2.debug('reading '+urls)
logging2.info('what should i write here')
logging2.warning('a warning here')
logging2.error('a error test here')
logging2.critical('what is a critical??')
return returns
def main():
"""
执行入口,层次判断,任务转移.
>>> main()
时间 深度 当前完成 待完成
"""
i = 0
th = threading2.ThreadPool(workQueue,resultQueue,options.number)
td = threading2.MyThread2(workQueue,resultQueue,i,10)#屏幕打印进程
while i <= options.deep:#层次循环
if i == 0:
th.add_jobs(analyseurl,options.urls)
i += 1
th.wait_for_done()
while workQueue.qsize():
th.createThreadPool(options.number)
th.wait_for_done()
td.deep = i
else:#这里还有问题,现在为方案一
if resultQueue.qsize():#当前任务队列完成,结果队列满了
while resultQueue.qsize():#有任务,取出任务
t = resultQueue.get()
alldoneQueue.put(t)#这里要不要加入一个任务url的判断呢,可以判断域名,是否重复等等
th.add_jobs(analyseurl,t)#添加任务
###如何新建线程呢
while workQueue.qsize():#任务队列不为空
th.createThreadPool(options.number)
th.wait_for_done()
i += 1
td.deep = i
if resultQueue.qsize():
#print 'done and result to alldone'
while resultQueue.qsize():
t = resultQueue.get()
alldoneQueue.put(t)
print '干完了,结束'
return 0
if __name__ == '__main__':
if options.test:
doctest.testmod()
else:
main()