def getWebpage(link='', dataDir='webpages', timeSleep=0, cookies='', reLoad=False, debug=False, read=True, referer='', info='', retry_num=10): link = link.strip() if link == '': return createPath(dataDir) fname = combinePath(dataDir, toFname(cookies + link + info)) if not reLoad: try: f = open(fname, 'r') page = f.read() f.close() if debug: print 'read from cached file' return page except: pass if debug: print 'reading from web' time.sleep(timeSleep) for i in range(retry_num): try: page_info = urllib2.build_opener() page_info.addheaders = [('User-Agent', 'safari/536.25'), ('Cookie', cookies), ('Referer', referer)] page = page_info.open(link) if read: try: page = page.read() except: print 'error reading page, try again (until trying time reach 10)' print link continue break except (urllib2.HTTPError, urllib2.URLError), e: try: print e.code, except: pass page = '' time.sleep(timeSleep)
def getWebpage(link='', dataDir='webpages', timeSleep=0, cookies='', reLoad=False, debug=False, read=True,referer='', info='',retry_num=10): link=link.strip() if link=='': return createPath(dataDir) fname=combinePath(dataDir,toFname(cookies+link+info)) if not reLoad: try: f=open(fname,'r') page=f.read() f.close() if debug: print 'read from cached file' return page except: pass if debug: print 'reading from web' time.sleep(timeSleep) for i in range(retry_num): try: page_info = urllib2.build_opener() page_info.addheaders = [('User-Agent', 'safari/536.25'), ('Cookie', cookies), ('Referer',referer) ] page = page_info.open(link) if read: try: page=page.read() except: print 'error reading page, try again (until trying time reach 10)' print link continue break except (urllib2.HTTPError,urllib2.URLError), e: try: print e.code, except: pass page='' time.sleep(timeSleep)
# NOTE(review): interior of an image-download loop -- the enclosing 'def'
# header is not visible in this chunk.  'album_name' (target directory),
# 'info' (iterable of (caption, image-url) pairs) and 'type' (site name,
# 'douban' or 'renren'; it shadows the builtin 'type') presumably come from
# the enclosing function's parameters -- confirm against the full file.
createPath(album_name)
i=0
#print len(info)
for name,url in info:
    i+=1
    # Strip a parenthesised suffix from the caption, e.g. "title (3)" -> "title ".
    # ind>1 (not >=0) keeps captions that merely *start* with '(' intact.
    ind=name.find('(')
    if ind>1:
        name=name[:ind]
    cap_content=name  # NOTE(review): never used below in this chunk -- dead store?
    # Sanitise the caption into a file-name-safe token: drop '/', collapse
    # whitespace runs, then use '_' as the word separator.
    name=name.replace('/','')
    name=' '.join(name.split())
    name=name.replace(' ','_')
    # File name is "<index>_<caption>.jpg", or just "<index>.jpg" when the
    # caption sanitised down to nothing.
    if name!='':
        fname=str(i)+'_'+name
    else:
        fname=str(i)
    name=combinePath(album_name,fname)+'.jpg'
    try:
        urllib.urlretrieve(url,name)
    except:
        # Best-effort: log the failing (url, path) pair and move on.
        print url,name
        continue
    # A very small file (<1000 bytes) is treated as a placeholder/error
    # image; retry once using the site's alternative image-size URL scheme.
    if os.stat(name).st_size<1000:
        if type=='douban':
            url=url.replace('large','photo')
        if type=='renren':
            url=url.replace('original','large')
        try:
            urllib.urlretrieve(url,name)
        except:
            print url, name
            continue  # NOTE(review): redundant -- already the last statement of the loop body