Exemple #1
0
def find_next_indexes(soup):
    '''
    Build the URL of every page of an album index or an album page.

    The pagination anchors are looked up with
    ``soup.findAll('a', 'pix-navi-page')``. The largest numeric link label
    is taken as the page count, and the href of the first anchor is used as
    a template whose ``p`` query parameter is set to each page number from
    1 up to that count.

    Returns a list of URLs; the list is empty when there are no pagination
    anchors or when none of them has a numeric label.
    '''
    indexes = soup.findAll('a', 'pix-navi-page')
    # tag.string may be None (BeautifulSoup returns None when a tag does not
    # wrap exactly one string), so check it before calling isdigit().
    page_numbers = [int(tag.string) for tag in indexes
                    if tag.string and tag.string.isdigit()]
    if not page_numbers:
        # Nothing to enumerate; also avoids max() on an empty sequence.
        return []
    max_p = max(page_numbers)

    template = urlsplit(httplib.html_unescape(indexes[0]['href']))
    #i don't want patch urllib.unquote. bug description: http://bugs.python.org/issue1712522
    #quick fix is convert to ascii.
    query_dict = parse_qs(template.query.encode('ascii'))

    urls = []
    for p in range(1, max_p + 1):
        # Only the query string changes between pages.
        query_dict['p'] = p
        page = SplitResult(template.scheme, template.netloc, template.path,
                           urlencode(query_dict, doseq=True), template.fragment)
        urls.append(page.geturl())
    return urls
Exemple #2
0
def get_content_from_detail(soup, url):
    '''
    Fetch the picture path and description for a picture detail page.

    The picture id is the last path segment of *url* and the user id is its
    ``u`` query parameter. The album id is read from the JSON embedded in
    the page, which looks like:

    <script id="pix-json-set-info" type="application/json">
    {
        "albumId": 17327808,
        "likeStatus": false,
        "categoryId":"11",
        "categoryEname":"chongwu",
        "prevAlbumUrl": "",
        "nextAlbumUrl": "/detail/52085246?u=60786793"
    }
    </script>

    These three ids are substituted into the PicDetailAjax URL; the response
    body is decoded as GBK and parsed as JSON.

    Returns a ``(picture, description)`` tuple taken from the first entry of
    ``data.models`` in the response, with the description HTML-unescaped.
    '''
    parts = urlsplit(url)
    pic_id = parts.path.split('/')[-1]
    user_id = parse_qs(parts.query)['u'][0]

    info_script = soup.find('script', id='pix-json-set-info')
    album_id = json.loads(info_script.string)['albumId']

    template = 'http://wantu.taobao.com/ajax/PicDetailAjax.do?picId=%s&userId=%s&albumId=%s&t=1365154666759&_method=read'
    ajax_url = template % (pic_id, user_id, album_id)

    # Index 2 of httplib.urlopen's result is the undecoded body
    # (presumably a (status, headers, body) tuple -- confirm in the helper).
    body = httplib.urlopen(ajax_url)[2]
    model = json.loads(body.decode('gbk'))['data']['models'][0]
    return (model['picPath'], httplib.html_unescape(model['desc']))