/
man3.py
executable file
·137 lines (121 loc) · 4.44 KB
/
man3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/python
#-*- coding:utf8-*-
# Simple multi-threaded crawler: fetch pages starting from a seed URL, follow
# links up to a given depth, and store pages matching a key regex in SQLite.
import BeautifulSoup,re,Queue
import urllib,urllib2
import optparse
import threading
import logging2
import doctest
import threading2
import time
import sqlite3
# Command-line options: seed URL, crawl depth, log file and level, self-test
# flag, worker-thread count, SQLite database path, and the filter key (regex).
parser = optparse.OptionParser()
parser.add_option('-u','--urls', dest = "urls",help = "the urls ",metavar = "URLS")
parser.add_option('-d','--deep',type = 'int', dest = "deep",help = "the deep", metavar = "DEEP",default = 0)
parser.add_option('-f' , '--file',type = 'string' , dest = 'logfile', help = "the file is to record the logs", metavar = "LOGFILE",default = '/tmp/tmp.log')
parser.add_option('-l','--loglevel',type = 'int' ,dest = 'loglevel',help = 'for loglevel(1-5),the number is big,the level is heigh',metavar = 'LOGLEVEL(1-5)',default = 1)
parser.add_option('--testself',dest = 'test',help = 'self test or test myself')
parser.add_option('-t','--thread',type = 'int',dest = 'number' ,help = 'the threadnumber',metavar = 'NUMBER',default =10)
parser.add_option('--dbfile',dest = 'dbfile' ,help = 'the db file path',metavar = 'DBFILE',default ='test.db')
parser.add_option('--key',dest = 'key' ,help = 'the key is here,if you want a chinese,you must be utf-8 in your system,or change the code.',metavar = 'KEY',default ='.*')
options,args = parser.parse_args()
print "all the optins",options
print "all the args",args
# workQueue: jobs waiting to run; resultQueue: links produced by the layer
# that just finished; alldoneQueue: every link collected over the whole crawl.
workQueue = Queue.Queue()
resultQueue = Queue.Queue()
alldoneQueue = Queue.Queue()
# Initialize the logging2 helper; afterwards logging2.* can be used to log.
logging2.init(options.loglevel,options.logfile)
# Decode the key option (assumed UTF-8 on the command line) into unicode so it
# can be matched against decoded page bodies.
keyinsys = options.key.decode('utf-8')
print 'keyssss',keyinsys
# NOTE(review): the database connection and table creation now happen inside
# analyseurl(); this earlier module-level setup is kept commented for reference.
#conn = sqlite3.connect(options.dbfile)
#cor = conn.cursor()
#cor.execute('create table if not exists keyofhtml( id integer primary key,urls text,key text,htmls text)')
def analyseurl(urls):
"""
功能:分析urls,返回列表格式的字典
字典格式:{'name':names,'urls':url}
这里将符合要求的页面信息插入数据库,还包括日志信息
还包括 key的判断????
"""
returns=[]
print urls
html = urllib2.urlopen(urls,timeout=50)
try:
conn = sqlite3.connect(options.dbfile)
cor = conn.cursor()
cor.execute('create table if not exists keyofhtml( id integer primary key,urls text,key text,htmls text)')
data = html.read()
rr = re.compile(r"""content\=["|']text\/html\;charset\=(\w*?)["|']""")
m = rr.search(data)
if m:
code = m.group(1)
if code:
data = data.decode(code)
rekey = re.compile(keyinsys)
good = rekey.search(data)
if good:
data = data.replace("'",'"')#纠结的单引号怎么处理?
sqls = "insert into keyofhtml(urls,key,htmls) values('%s','%s','%s')"
cor.execute(sqls%(urls,keyinsys,data))
conn.commit()
conn.close()
logging2.debug('reading '+urls)
logging2.info('what should i write here')
logging2.warning('a warning here')
logging2.error('a error test here')
logging2.critical('what is a critical??')
#print 'reading'
except:
print 'error'
logging2.error('error ong reading '+urls)
soup = BeautifulSoup.BeautifulSoup(data)
temp = soup.findAll('a',href=re.compile(r'http.*'))#为什么不直接用re匹配a标签
logging2.debug('analysing '+urls)
#print 'analysing'
for tt in temp:
hrefs = tt['href']#have?
if hrefs.startswith('http'):
if tt.string:#span?????
returns.append({'name':tt.string,'urls':hrefs})
else:
returns.append({'name':'NoName','urls':hrefs})
else:
continue
return returns
def main():
    """
    Drive the breadth-first crawl, one depth layer at a time.

    Layer 0 seeds the pool with options.urls; each subsequent layer drains
    resultQueue (links found by the previous layer), archives each link to
    alldoneQueue, and enqueues it for fetching.  Returns 0 when finished.
    """
    i = 0  # current crawl depth
    th = threading2.ThreadPool(workQueue,resultQueue,options.number)
    td = threading2.MyThread2(workQueue,resultQueue,i,10)
    while i <= options.deep:
        if i == 0:
            # First layer: only the seed URL is fetched.
            th.add_jobs(analyseurl,options.urls)
            i += 1
            th.wait_for_done()
            td.deep = i
        else:
            # Still problematic: this is plan 1 — a new thread pool is
            # created once per crawled layer.
            if resultQueue.qsize():  # current layer done, result queue filled
                while resultQueue.qsize():  # drain every queued result
                    t = resultQueue.get()
                    alldoneQueue.put(t)
                    th.add_jobs(analyseurl,t['urls'])
                # How should new threads be created here?
                th.createThreadPool(options.number)
                th.wait_for_done()
            i += 1
            td.deep = i
    # Flush links produced by the final layer into alldoneQueue.
    if resultQueue.qsize():
        while resultQueue.qsize():
            t = resultQueue.get()
            alldoneQueue.put(t)
    print '干完了,结束'
    return 0
if __name__ == '__main__':
main()
print 'alldone',alldoneQueue.qsize()