def picurl(url, path, max_retries=None):
    """Fetch one picture page and hand its image URLs to ``pic_list``.

    The target directory ``path`` is created when it does not exist yet.
    The page is then loaded with ``loadurl`` and searched for the first
    ``<p>...</p>`` inside either a ``<div id="picture...">`` or, as a
    fallback, a ``<div class="postContent...">``.  Every ``<img src=...>``
    found in that paragraph is passed to ``pic_list`` together with ``path``.

    Args:
        url: Address of the page to scrape.
        path: Directory passed on to ``pic_list``; created if missing.
        max_retries: How many times a failed load is retried.  ``None``
            (the default) retries forever, which is the historical
            behaviour.  ``0`` means a single attempt.

    Returns:
        ``False`` when the page could not be loaded within ``max_retries``
        or when neither content pattern matched; ``None`` otherwise.
    """
    if os.path.exists(path):
        # Single-argument print: identical output on Python 2, valid on 3.
        print('%s exist' % path)
    else:
        os.makedirs(path)

    # NOTE(review): loadurl is assumed to return '' on failure -- only that
    # exact value triggers a retry, as in the original loop.
    failures = 0
    while True:
        html = loadurl(url)
        if html != '':
            break
        print('load %s error' % url)
        failures += 1
        if max_retries is not None and failures > max_retries:
            return False

    rePicContent1 = '<div.*?id="picture.*?>.*?<p>(.*?)</p>'
    rePicContent2 = '<div.*?class="postContent.*?>.*?<p>(.*?)</p>'
    rePicList = '<img.*?src="(.*?)".*?>'

    picContent = re.findall(rePicContent1, html, re.S)
    if not picContent:
        # Primary layout not found: record it and try the alternative one.
        errorReport.errorIndex(url, rePicContent1)
        picContent = re.findall(rePicContent2, html, re.S)
    if not picContent:
        errorReport.errorIndex(url, rePicContent2)
        print('load false, over download this page and return')
        return False

    # Only the first matching paragraph is scanned for images.
    picList = re.findall(rePicList, picContent[0], re.S)
    pic_list(picList, path)

# Example:
# picurl('http://www.meizitu.com/a/454.html', '/home/hus/Desktop/demo')
def oneOfSeries(urllist, path):
    """Download every page of a series into its own sub-directory.

    For each URL the part between the last ``/`` and ``.html`` is used as
    the directory name below ``path``; the page is then processed by
    ``meizi_page_download.picurl``.

    Args:
        urllist: Iterable of page URLs ending in ``<name>.html``.
        path: Base directory; each page goes to ``path/<name>``.

    Outcomes are recorded through ``errorReport``: ``success(url)`` when
    the page was processed, ``errorIndex(url, pattern)`` when no name could
    be extracted from the URL (or an ``IndexError`` escaped the download).
    """
    # The dot before "html" is escaped so it only matches a literal ".";
    # unescaped, a name such as "ahtml.html" produced an empty capture.
    searchname = r'.*/(.*?)\.html'
    for url in urllist:
        try:
            name = re.findall(searchname, url, re.S)
            current_path = path + '/' + name[0]
            result = meizi_page_download.picurl(url, current_path)
        except IndexError:
            errorReport.errorIndex(url, searchname)
            continue
        # picurl signals failure with an explicit False (and has already
        # reported it); do not additionally log such a page as a success.
        if result is not False:
            errorReport.success(url)
def tag_series(url, path, max_retries=None):
    """Collect the series links on a tag page and download each of them.

    The page at ``url`` is loaded with ``loadurl``; every ``href`` of an
    ``<a ... target...>`` that follows a ``<div class="pic">`` is gathered
    and the resulting list is passed to ``oneOfSeries`` together with
    ``path``.  Pages go directly below ``path``; no per-tag sub-directory
    is created.

    Args:
        url: Address of the tag/listing page.
        path: Base directory handed to ``oneOfSeries``.
        max_retries: How many times a failed load is retried.  ``None``
            (the default) retries forever, which is the historical
            behaviour.  ``0`` means a single attempt.

    Returns:
        ``False`` when the page could not be loaded within ``max_retries``;
        ``None`` otherwise.
    """
    reSeriesList = '<div .*?class="pic".*?>.*?<a.*?href="(.*?)".*?target.*?>'

    # NOTE(review): loadurl is assumed to return '' on failure -- only that
    # exact value triggers a retry, as in the original loop.
    failures = 0
    while True:
        html = loadurl(url)
        if html != '':
            break
        # Single-argument print: identical output on Python 2, valid on 3.
        print('load %s error' % url)
        failures += 1
        if max_retries is not None and failures > max_retries:
            return False

    seriesList = re.findall(reSeriesList, html, re.S)
    if not seriesList:
        errorReport.errorIndex(url, reSeriesList)
        return
    oneOfSeries(seriesList, path)

# Example:
# tag_series('http://www.meizitu.com/a/sifang.html', '/home/hus/Desktop')