-
Notifications
You must be signed in to change notification settings - Fork 0
/
main_spider.py
50 lines (42 loc) · 1.93 KB
/
main_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import url_manager
import html_downloader
import html_parser
import html_output
class MainSpider3(object):
    """Crawl a seed page, follow each subject link and save its pictures.

    The spider wires together four project collaborators: a URL manager
    (supplies the seed URL), a downloader (fetches raw page bytes), a parser
    (extracts subject links, subject metadata and picture links) and an
    output object (downloads and stores each picture).
    """

    # Number of pictures shown on a single page of a subject.
    PICS_PER_PAGE = 4

    # Suffix of a subject URL; follow-up pages insert "_<index>" before it.
    PAGE_SUFFIX = '.html'

    # Request headers sent with every download.  The key must be the real
    # HTTP field name "User-Agent" (hyphen, not underscore); otherwise the
    # browser identification is not sent as the User-Agent header.
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36"
    }

    def __init__(self):
        """Create the collaborators and the per-instance request headers."""
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.output = html_output.HtmlOutput()
        # Copy so that mutating one instance's headers never changes the
        # shared class-level default.
        self.headers = dict(self.DEFAULT_HEADERS)

    def run(self):
        """Download the seed page and process every subject URL found on it."""
        main_seed_url = self.urls.get_main_seed_url()
        content = self._fetch(main_seed_url, retry_count=3)
        for subject_url in self.parser.parse_subject_urls(content):
            self._run_subject_url(subject_url)

    def _run_subject_url(self, subject_url):
        """Download every page of one subject and save all pictures on them.

        The first page is ``subject_url`` itself; page ``n`` (n > 1) is the
        same URL with ``_n`` inserted before the ``.html`` suffix.  The number
        of pages is derived from the picture count reported by the parser.

        Args:
            subject_url: URL of the first page of the subject.
        """
        print('subject_url is ', subject_url)
        content = self._fetch(subject_url, retry_count=2)
        mj_info = self.parser.parse_subject_mj_info(content)
        if mj_info is None:
            # The parser found no subject metadata: nothing to download.
            return

        mj_max_count = int(mj_info['count'])
        mj_name = str(mj_info['mj_name'])

        page_count = self._page_count(mj_max_count, self.PICS_PER_PAGE)
        for index in range(1, page_count + 1):
            # Always derive the page URL from the unmodified first-page URL;
            # rewriting subject_url in place would stack suffixes
            # ("..._2_3.html") from the third page on.
            page_url = self._page_url(subject_url, index)
            print('curreent url is ', page_url)
            content = self._fetch(page_url, retry_count=2)
            for pic_url in self.parser.parse_page_pics(content):
                self.output.download_and_save(pic_url, mj_name)

    def _fetch(self, url, retry_count):
        """Download ``url`` with the spider's headers and decode it as UTF-8."""
        raw = self.downloader.download(
            url, retry_count=retry_count, headers=self.headers)
        return raw.decode('utf8')

    @staticmethod
    def _page_count(pic_count, per_page):
        """Return how many pages are needed to show ``pic_count`` pictures.

        This is the ceiling of ``pic_count / per_page`` and never negative.
        """
        if pic_count <= 0:
            return 0
        return (pic_count + per_page - 1) // per_page

    @staticmethod
    def _page_url(subject_url, index):
        """Return the URL of page ``index`` (1-based) of a subject.

        Page 1 is ``subject_url`` unchanged.  For later pages the trailing
        ``.html`` (when present) is replaced by ``_<index>.html``.
        """
        if index <= 1:
            return subject_url
        suffix = MainSpider3.PAGE_SUFFIX
        base = subject_url
        if base.endswith(suffix):
            base = base[:-len(suffix)]
        return base + '_' + str(index) + suffix
# Script entry point: build the spider and start the crawl only when this
# file is executed directly, not when it is imported.
if __name__ == '__main__':
    spider = MainSpider3()
    spider.run()