/
meizi_series_getpage.py
56 lines (50 loc) · 1.52 KB
/
meizi_series_getpage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 29 13:23:05 2015
@author: hus
"""
import re
import urllib2
import meizi_page_download
import errorReport
#url = 'http://www.meizitu.com/a/5195.html'
#meizitu_download.picurl(url,'/home/hus/Desktop','1')
def loadurl(url):
    """Fetch *url* and return the raw response body.

    The URL is opened with a 5-second timeout and the whole response is
    read into memory.

    Args:
        url: Address to fetch.

    Returns:
        The response body, or ``''`` when opening or reading failed.
        Failures are reported through ``errorReport.errorLoadUrl(url)``
        rather than raised, so callers treat ``''`` as "load failed".
    """
    try:
        conn = urllib2.urlopen(url, data=None, timeout=5)
        try:
            return conn.read()
        finally:
            # Release the underlying socket even when read() raises;
            # the response object is not closed automatically.
            conn.close()
    except Exception:
        # Deliberate best-effort boundary: any failure is reported and
        # turned into an empty result instead of aborting the crawl.
        errorReport.errorLoadUrl(url)
        return ''
def oneOfSeries(urllist, path):
    """Download every page in *urllist* into its own directory under *path*.

    For each URL the segment that precedes ``.html`` (after a ``/``) is
    extracted and appended to *path* with a ``/`` separator; the result
    is handed to ``meizi_page_download.picurl`` as the target location.

    Args:
        urllist: Iterable of page URLs.
        path: Base directory the per-page directory names are appended to.

    Returns:
        None. Each URL is reported through ``errorReport.success`` on
        completion, or through ``errorReport.errorIndex`` when an
        ``IndexError`` occurs.
    """
    # Raw string with an escaped dot so only a literal ".html" matches
    # (an unescaped "." would also accept e.g. "xhtml").
    searchname = r'.*/(.*?)\.html'
    name_pattern = re.compile(searchname, re.S)
    for url in urllist:
        try:
            name = name_pattern.findall(url)
            # name[0] raises IndexError when the URL does not match.
            current_path = path + '/' + name[0]
            meizi_page_download.picurl(url, current_path)
            errorReport.success(url)
        except IndexError:
            # Also covers an IndexError raised inside the download call,
            # which is reported the same way as a non-matching URL.
            errorReport.errorIndex(url, searchname)
# Get the URL of every photo set listed on one page of a series.
def tag_series(url, path, max_retries=5):
    """Find the photo-set links on the page at *url* and download each one.

    The page is fetched with ``loadurl``; an empty result counts as a
    failed load and is retried. Links are then extracted with a regular
    expression and passed to ``oneOfSeries`` together with *path*.

    Args:
        url: Address of the series listing page.
        path: Base directory forwarded to ``oneOfSeries``.
        max_retries: Maximum number of load attempts before giving up.
            ``None`` retries without limit.

    Returns:
        None. When no link is found the page is reported through
        ``errorReport.errorIndex``; when every load attempt fails the
        function returns without doing anything further.
    """
    reSeriesList = '<div .*?class="pic".*?>.*?<a.*?href="(.*?)".*?target.*?>'
    html = ''
    attempts = 0
    # Bounded retry: an unreachable URL must not spin forever.
    while max_retries is None or attempts < max_retries:
        html = loadurl(url)
        if html != '':
            break
        attempts += 1
        print('load %s error' % url)
    if html == '':
        # Every attempt failed (or none was allowed); nothing to parse.
        return
    seriesList = re.findall(reSeriesList, html, re.S)
    if not seriesList:
        errorReport.errorIndex(url, reSeriesList)
    else:
        oneOfSeries(seriesList, path)
#tag_series('http://www.meizitu.com/a/sifang.html','/home/hus/Desktop')