def find_next_indexes(soup):
    ''' List the URL of every page of an album index or an album page.

    Collects the pagination anchors (``soup.findAll('a', 'pix-navi-page')``),
    takes the largest numeric anchor label as the page count, and rebuilds
    the first anchor's href once per page with the ``p`` query parameter set
    to 1..max.

    :param soup: parsed page exposing BeautifulSoup's ``findAll``.
    :returns: list of page URLs, in page order; empty when there are no
        pagination anchors or none of them carries a numeric label.
    '''
    indexes = soup.findAll('a', 'pix-navi-page')
    # tag.string is None when an anchor does not wrap exactly one text node,
    # and labels such as "next"/"prev" are not page numbers: skip both.
    page_numbers = [int(tag.string) for tag in indexes
                    if tag.string is not None and tag.string.isdigit()]
    if not page_numbers:
        # Nothing to paginate over; max() on an empty list would raise.
        return []
    max_p = max(page_numbers)

    result = urlsplit(httplib.html_unescape(indexes[0]['href']))
    # Deliberately not patching urllib.unquote
    # (bug description: http://bugs.python.org/issue1712522);
    # the quick fix is to convert the query string to ASCII before parsing.
    # NOTE(review): this workaround targets Python 2 text handling -- confirm
    # before running under Python 3, where encode() yields bytes.
    query_dict = parse_qs(result.query.encode('ascii'))

    urls = []
    for p in range(1, max_p + 1):
        # Only the page number changes; every other query parameter is kept.
        query_dict['p'] = p
        page = SplitResult(result.scheme, result.netloc, result.path,
                           urlencode(query_dict, doseq=True), result.fragment)
        urls.append(page.geturl())
    return urls
def get_content_from_detail(soup, url):
    ''' Fetch the picture path and description for a picture detail page.

    The picture id is the last path segment of ``url`` and the user id is its
    ``u`` query parameter. The album id is read from the JSON embedded in the
    page, which looks like:

    <script id="pix-json-set-info" type="application/json">
    {
        "albumId": 17327808,
        "likeStatus": false,
        "categoryId":"11",
        "categoryEname":"chongwu",
        "prevAlbumUrl": "",
        "nextAlbumUrl": "/detail/52085246?u=60786793"
    }
    </script>

    The three ids are substituted into the PicDetailAjax.do URL, and the
    first entry of ``data.models`` in the JSON reply supplies the result.

    :param soup: parsed detail page exposing BeautifulSoup's ``find``.
    :param url: URL of the detail page, e.g. ``/detail/52085246?u=60786793``.
    :returns: ``(picture, description)`` -- the model's ``picPath`` and its
        HTML-unescaped ``desc``.
    '''
    url_parts = urlsplit(url)
    pic_id = url_parts.path.split('/')[-1]
    user_id = parse_qs(url_parts.query)['u'][0]

    info_script = soup.find('script', id='pix-json-set-info')
    album_id = json.loads(info_script.string)['albumId']

    template = 'http://wantu.taobao.com/ajax/PicDetailAjax.do?picId=%s&userId=%s&albumId=%s&t=1365154666759&_method=read'
    ajax_url = template % (pic_id, user_id, album_id)

    # Element 2 of the urlopen() result is decoded as GBK text; presumably it
    # is the raw response body -- verify against the project's httplib helper.
    payload = httplib.urlopen(ajax_url)[2].decode('gbk')
    model = json.loads(payload)['data']['models'][0]

    picture = model['picPath']
    description = httplib.html_unescape(model['desc'])
    return (picture, description)