-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
111 lines (106 loc) · 3.84 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# coding=utf-8
import urladmin
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import functions
import logging
logging.basicConfig(format='%(asctime)s %(message)s', filename='program.log', level=logging.DEBUG)

# Base URL that relative page links found in a thread are appended to.
urlhome = 'http://www.xx007.cn/'


def load_line_set(path):
    """Return the distinct lines of the text file at *path* as a set.

    Line terminators are stripped (``str.splitlines``).  A file that does
    not exist yet yields an empty set, because the crawl creates these
    files on its first append; any other I/O error is re-raised.
    """
    import errno  # local import: only needed by this helper

    try:
        with open(path) as handle:
            return set(handle.read().splitlines())
    except IOError as exc:
        if exc.errno != errno.ENOENT:
            raise
        logging.warning('%s not found, starting with an empty set', path)
        return set()


# Signature snippets recorded by earlier runs (one per line).
filename = "yichen.txt"
siglineset = load_line_set(filename)

# Phone numbers recorded by earlier runs (one per line).
file_phone = "phone.txt"
phoneset = load_line_set(file_phone)
# Attributes identifying the <div> elements this script treats as signatures.
SIGNATURE_DIV_ATTRS = {'style': 'width:85%;overflow-x: hidden;'}
# Relative links whose href starts with "dispbbs" (further pages of a thread).
PAGER_HREF_RE = re.compile('^dispbbs.*')
# Seconds to wait for each HTTP request.
FETCH_TIMEOUT = 30
# Encoding used to decode every downloaded page.
PAGE_ENCODING = 'gb18030'


def normalize_signature(markup):
    """Return *markup* from its first ``<img`` onward with all whitespace removed.

    Returns ``None`` when *markup* contains no ``<img``, so callers can skip
    the element instead of failing on it.
    """
    start = markup.find('<img')
    if start == -1:
        return None
    return "".join(markup[start:].split())


def append_if_new(value, seen, path):
    """Record *value* once: add it to *seen*, print it, append it to *path*.

    Returns ``True`` when *value* was new and ``False`` when it was already
    in *seen* (in which case nothing is printed or written).
    """
    if value in seen:
        return False
    seen.add(value)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(value)
    with open(path, 'a+') as out:
        out.write(value + "\n")
    return True


def fetch_soup(url):
    """Download *url*, decode it and return the parsed BeautifulSoup tree.

    Returns ``None`` if the download, decoding or parsing fails; the failure
    is logged so that skipped pages are visible in the log file.
    """
    try:
        response = urllib2.urlopen(url, timeout=FETCH_TIMEOUT)
        try:
            raw = response.read()
        finally:
            response.close()
        return BeautifulSoup(raw.decode(PAGE_ENCODING))
    except Exception as exc:
        # Best effort per page: one bad page must not stop the crawl, but
        # KeyboardInterrupt/SystemExit still propagate.
        logging.warning('skipping %s: %r', url, exc)
        return None


def harvest_page(soup):
    """Record every unseen signature on *soup* and every unseen phone in it.

    Uses the module-level ``siglineset``/``filename`` and
    ``phoneset``/``file_phone`` pairs set up earlier in this file.
    """
    for div in soup.findAll('div', attrs=SIGNATURE_DIV_ATTRS):
        signature = normalize_signature(str(div))
        if signature is None:
            continue
        append_if_new(signature, siglineset, filename)
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether phones were extracted only for new signatures.  Extracting
        # for every signature is the superset; phoneset still de-duplicates.
        # functions.get_phones is defined elsewhere; it is iterated here and
        # presumably yields phone-number strings -- confirm.
        for phone in functions.get_phones(signature):
            append_if_new(phone, phoneset, file_phone)


def main():
    """Crawl every thread URL and harvest its first page and linked pages."""
    logging.info('geting allnoteurl start')
    links = urladmin.getallnoteurl()
    logging.info('parsing start')
    for link in links:
        print(link)
        first_page = fetch_soup(link)
        if first_page is None:
            continue
        # First page of the thread.
        harvest_page(first_page)
        # Remaining pages of the thread.
        pager_links = first_page.findAll('a', attrs={'href': PAGER_HREF_RE})
        if pager_links:
            # The last matching anchor is dropped, as in the original code.
            # NOTE(review): presumably it is not a distinct page -- confirm.
            pager_links.pop()
        for anchor in pager_links:
            page_url = urlhome + anchor["href"]
            print(page_url)
            page = fetch_soup(page_url)
            if page is None:
                continue
            harvest_page(page)
    logging.info('parsing over')


# Running the file as a script behaves as before; importing it no longer
# starts the crawl.
if __name__ == '__main__':
    main()