Example #1
import time
import urllib2

# createPath, combinePath, and toFname are helper functions defined
# elsewhere in the same project (directory creation, path joining, and
# URL-to-filename mangling).

def getWebpage(link='',
               dataDir='webpages',
               timeSleep=0,
               cookies='',
               reLoad=False,
               debug=False,
               read=True,
               referer='',
               info='',
               retry_num=10):
    link = link.strip()
    if link == '':
        return
    createPath(dataDir)
    # The cache file name is derived from the cookies, URL, and extra info,
    # so different sessions of the same URL are cached separately.
    fname = combinePath(dataDir, toFname(cookies + link + info))
    if not reLoad:
        # Serve from the on-disk cache when a copy already exists.
        try:
            f = open(fname, 'r')
            page = f.read()
            f.close()
            if debug: print 'read from cached file'
            return page
        except IOError:
            pass

    if debug: print 'reading from web'
    time.sleep(timeSleep)
    page = ''
    for i in range(retry_num):
        try:
            opener = urllib2.build_opener()
            opener.addheaders = [('User-Agent', 'safari/536.25'),
                                 ('Cookie', cookies),
                                 ('Referer', referer)]
            page = opener.open(link)
            if read:
                try:
                    page = page.read()
                except Exception:
                    print 'error reading page, retrying (up to %d attempts)' % retry_num
                    print link
                    continue
            break
        except (urllib2.HTTPError, urllib2.URLError), e:
            # URLError has no .code attribute, so guard the access.
            if hasattr(e, 'code'):
                print e.code,
            page = ''
        time.sleep(timeSleep)
    # Assumed completion: write the page to the cache and return it,
    # as the cache-read branch above implies.
    if read and page:
        f = open(fname, 'w')
        f.write(page)
        f.close()
    return page
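
A minimal usage sketch, assuming the helper functions above are available; the URL, sleep, and debug arguments are illustrative, not from the original project:

# Hypothetical call: fetch http://example.com/ once, then serve the
# cached copy on later calls (the URL is a placeholder).
html = getWebpage(link='http://example.com/', timeSleep=1, debug=True)
if html:
    print len(html)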
Example #2
import os
import urllib

# Fragment from a photo-album downloader; album_name, info (a list of
# (caption, url) pairs), and type (the source site, e.g. 'douban' or
# 'renren') come from the enclosing function. createPath and combinePath
# are helpers defined elsewhere in the same project.
createPath(album_name)
i = 0
for name, url in info:
    i += 1
    # Strip any parenthesized suffix from the caption.
    ind = name.find('(')
    if ind > 1: name = name[:ind]
    cap_content = name
    # Sanitize the caption into a safe file name.
    name = name.replace('/', '')
    name = ' '.join(name.split())
    name = name.replace(' ', '_')
    if name != '':
        fname = str(i) + '_' + name
    else:
        fname = str(i)
    name = combinePath(album_name, fname) + '.jpg'
    try:
        urllib.urlretrieve(url, name)
    except IOError:
        print url, name
        continue
    # A file under 1000 bytes is assumed to be a placeholder image;
    # rewrite the URL to the site-specific full-size variant and retry.
    if os.stat(name).st_size < 1000:
        if type == 'douban':
            url = url.replace('large', 'photo')
        if type == 'renren':
            url = url.replace('original', 'large')
        try:
            urllib.urlretrieve(url, name)
        except IOError:
            print url, name
            continue
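
A minimal sketch of driving this fragment, assuming it is wrapped in a hypothetical downloadAlbum(album_name, info, type) function; every name and URL here is illustrative:

# Hypothetical driver: downloadAlbum wraps the loop above; the album
# name, captions, and URLs are placeholders, not from the original code.
photos = [('Sunset (2012)', 'http://img.example.com/large/1.jpg'),
          ('Beach day', 'http://img.example.com/large/2.jpg')]
downloadAlbum('my_album', photos, 'douban')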