#!/usr/bin/python
# -*- coding: utf-8 -*-
#"for get all the html"
import BeautifulSoup,re,Queue
import urllib,urllib2
import optparse
import threading
import logging
import logging2
import doctest
import threading2
import time
import sqlite3
# parse all the command line options and arguments
parser = optparse.OptionParser()
parser.add_option('-u', '--urls', dest='urls', help='the start URL to crawl', metavar='URLS')
parser.add_option('-d', '--deep', type='int', dest='deep', help='the crawl depth', metavar='DEEP', default=0)
parser.add_option('-f', '--file', type='string', dest='logfile', help='the file that records the logs', metavar='LOGFILE', default='/tmp/tmp.log')
parser.add_option('-l', '--loglevel', type='int', dest='loglevel', help='log level (1-5); a larger number means a higher level', metavar='LOGLEVEL(1-5)', default=1)
parser.add_option('--testself', dest='test', help='run the self test')
parser.add_option('-t', '--thread', type='int', dest='number', help='the number of threads', metavar='NUMBER', default=10)
parser.add_option('--dbfile', dest='filepath', help='the db file path', metavar='DBFILE', default='test.db')
options, args = parser.parse_args()
print "all the options", options
print "all the args", args
# workQueue holds pending tasks, resultQueue holds completed ones,
# and alldoneQueue collects everything once the whole crawl is finished
workQueue = Queue.Queue()
resultQueue = Queue.Queue()
alldoneQueue = Queue.Queue()
# initialise logging2; from here on logging2 can be used to write log entries
logging2.init(options.loglevel, options.logfile)
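# logging2 is a local helper module, not the stdlib logging package; from its
# use in this file it is assumed to expose init(level, path), debug(msg) and
# error(msg). With the stdlib it would correspond roughly to something like
# (an illustrative assumption, including the 1-5 -> DEBUG..CRITICAL mapping):
#   logging.basicConfig(filename=options.logfile,
#                       level=options.loglevel * 10)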
def analyseurl(urls):
    """
    Purpose: analyse a URL and return its links as a list of dicts.
    Dict format: {'name': name, 'urls': url}
    Pages that match the requirements are inserted into the database
    here, along with the log messages.
    """
    returns = []
    try:
        html = urllib2.urlopen(urls, timeout=30)
        data = html.read()
        # detect the page encoding from the meta charset declaration
        rr = re.compile(r"""content=["']text/html;charset=(\w*?)["']""")
        m = rr.search(data)
        if m:
            code = m.group(1)
            if code:
                data = data.decode(code)
        logging2.debug('reading')
    except Exception:
        logging2.error('error on reading')
        return returns
    soup = BeautifulSoup.BeautifulSoup(data)
    temp = soup.findAll('a', href=re.compile(r'http.*'))
    logging2.debug('analysing')
    for tt in temp:
        hrefs = tt['href']
        if hrefs.startswith('http'):
            # an anchor that wraps other tags has .string == None
            if tt.string:
                returns.append({'name': tt.string, 'urls': hrefs})
            else:
                returns.append({'name': 'NoName', 'urls': hrefs})
        else:
            continue
    return returns
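# The docstring above says matching pages are inserted into the database, but
# no DB code appears in this file. A minimal sketch of what that insertion
# could look like with sqlite3 and the --dbfile option; the helper name and
# the table layout are assumptions, not part of the original.
def save_to_db(records, dbpath=options.filepath):
    conn = sqlite3.connect(dbpath)
    try:
        # one row per crawled link: (name, url)
        conn.execute('CREATE TABLE IF NOT EXISTS pages (name TEXT, url TEXT)')
        conn.executemany('INSERT INTO pages VALUES (?, ?)',
                         [(r['name'], r['urls']) for r in records])
        conn.commit()
    finally:
        conn.close()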
def main():
    i = 0
    th = threading2.ThreadPool(workQueue, resultQueue, options.number)
    td = threading2.MyThread2(workQueue, resultQueue, i, 10)
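    # threading2 is a local helper module, not shown in this file; from the
    # calls below it is assumed to expose ThreadPool(work_q, result_q, n) with
    # add_jobs(func, arg), createThreadPool(n) and wait_for_done(), plus a
    # MyThread2 worker whose .deep attribute tracks the current crawl depth.
    # These signatures are inferred from usage here, not confirmed.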
    while i <= options.deep:
        if i == 0:
            # depth 0: seed the pool with the start URL
            th.add_jobs(analyseurl, options.urls)
            i += 1
            th.wait_for_done()
            td.deep = i
        else:
            # still problematic; this is plan A for now
            if resultQueue.qsize():  # the current level is done, its results are queued
                while resultQueue.qsize():  # drain the results and requeue them as new jobs
                    t = resultQueue.get()
                    alldoneQueue.put(t)
                    th.add_jobs(analyseurl, t['urls'])
                ### how should new threads be created here?
                th.createThreadPool(options.number)
                th.wait_for_done()
            i += 1
            td.deep = i
    if resultQueue.qsize():
        # move any remaining results into alldoneQueue
        while resultQueue.qsize():
            t = resultQueue.get()
            alldoneQueue.put(t)
    print 'all work done, finished'
    return 0
if __name__ == '__main__':
    #analyseurl('http://www.baidu.com')
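    # --testself is declared above and doctest is imported, but neither is
    # wired up in the original; this hook is an assumption about the intent:
    if options.test:
        doctest.testmod()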
    main()
    print 'alldone', alldoneQueue.qsize()