import time
import urllib2

# createPath, combinePath and toFname are project helpers defined elsewhere
# (directory creation, path joining and URL-to-filename sanitising).

def getWebpage(link='',
               dataDir='webpages',
               timeSleep=0,
               cookies='',
               reLoad=False,
               debug=False,
               read=True,
               referer='',
               info='',
               retry_num=10):
    """Fetch a web page, caching the result on disk under dataDir."""
    link = link.strip()
    if link == '': return
    createPath(dataDir)
    fname = combinePath(dataDir, toFname(cookies + link + info))
    if not reLoad:
        # Serve the cached copy from disk if one exists.
        try:
            f = open(fname, 'r')
            page = f.read()
            f.close()
            if debug: print 'read from cached file'
            return page
        except IOError:
            pass

    if debug: print 'reading from web'
    time.sleep(timeSleep)
    for i in range(retry_num):
        try:
            page_info = urllib2.build_opener()
            page_info.addheaders = [('User-Agent', 'safari/536.25'),
                                    ('Cookie', cookies),
                                    ('Referer', referer)]
            page = page_info.open(link)
            if read:
                try:
                    page = page.read()
                except:
                    print 'error reading page, retrying (up to %d times)' % retry_num
                    print link
                    continue
            break
        except (urllib2.HTTPError, urllib2.URLError), e:
            # URLError carries no .code attribute, hence the guard below.
            try:
                print e.code,
            except AttributeError:
                pass
            page = ''
        time.sleep(timeSleep)

    # Cache the downloaded page for later calls (assumed completion: the
    # read-from-cache branch above implies the page is written here).
    if read and page != '':
        f = open(fname, 'w')
        f.write(page)
        f.close()
    return page
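A minimal usage sketch (not part of the original listing; the URL is a placeholder):

page = getWebpage(link='http://example.com/some/page',
                  dataDir='webpages',
                  timeSleep=1,
                  debug=True)
if page:
    print len(page), 'bytes'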
Example #3
    # Fragment: photos, album_name and info (a list) are set up earlier in
    # the enclosing function, which is not shown in this listing.
    for photo in photos:
        img = photo.find('img')
        if not img: continue
        if not img.has_key('alt'): continue
        name = img['alt']
        # Prefer the lazily-loaded image source when present.
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        # Swap the thumbnail path segment for the full-size one.
        url = url.replace('head', 'original')
        # Append as (name, url) so it matches the unpacking order in fetchAlbum.
        info.append((name, url))

    return (album_name, info)
    
# Note: the tuple-parameter signature below is Python 2 only.
def fetchAlbum((album_name, info), caption=False, type=None):
    createPath(album_name)
    i = 0
    for name, url in info:
        i += 1
        # Drop any parenthesised suffix from the caption text.
        ind = name.find('(')
        if ind > 1: name = name[:ind]
        cap_content = name
        # Sanitise the caption into a filename-safe form.
        name = name.replace('/', '')
        name = ' '.join(name.split())
        name = name.replace(' ', '_')
        if name != '':
            fname = str(i) + '_' + name
        else:
            fname = str(i)
        name = combinePath(album_name, fname) + '.jpg'
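        # The listing is cut off above; a minimal hedged sketch of the step
        # that presumably follows (an assumption, not the original code):
        # download the image and write it to the path computed in `name`.
        try:
            data = urllib2.urlopen(url).read()
            f = open(name, 'wb')
            f.write(data)
            f.close()
        except (urllib2.HTTPError, urllib2.URLError):
            print 'failed to fetch', url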