def worker_process_pic(gearman_worker, gearman_job):
    """Gearman worker callback: download the picture named by the job.

    The job payload (``gearman_job.data``) is the picture URL.  URLs already
    recorded as processed by ``wantudal`` are skipped; an HTTP failure is
    logged and the job is dropped rather than letting the exception kill the
    worker.
    """
    url = gearman_job.data
    if wantudal.is_processed(url):
        logging.debug('%s is skipped as it was processed already' % (url))
        return
    logging.debug('processing %s' % (url))
    try:
        # httplib here is a project helper; element [2] of its result is the
        # response body.
        content = httplib.urlopen(url)[2]
    except HTTPError as e:  # `as` replaces Py2-only `except HTTPError, e`
        logging.debug('http error: %s' % (e.code))
        return
    # NOTE(review): `content` is fetched but never persisted in this block —
    # confirm whether saving the picture happens elsewhere or is missing.
def worker_process_html(gearman_worker, gearman_job):
    """Gearman worker callback: crawl one HTML page and enqueue follow-ups.

    The job payload (``gearman_job.data``) is the page URL.  Dispatches on
    the URL's page type:

    * album index  -> discovers album pages plus further index pages
    * album page   -> discovers detail pages plus further album pages
    * detail page  -> discovers the single picture URL

    Every discovered URL is recorded via ``wantudal.save_url`` and
    resubmitted as a new gearman job; the current URL is marked finished.
    """
    url = gearman_job.data
    if wantudal.is_processed(url):
        logging.debug('%s is skipped as it was processed already' % (url))
        return
    logging.debug('processing %s' % (url))
    # This web is encoded by gbk.
    try:
        html_doc = httplib.urlopen(url)[2].decode('gbk')
    except HTTPError as e:
        # Consistent with worker_process_pic: log and drop the job so a
        # transient HTTP failure does not raise out of the worker callback.
        logging.debug('http error: %s' % (e.code))
        return
    soup = BeautifulSoup(html_doc,
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    if is_album_index(url):
        albums = get_album_from_index(soup)
        logging.debug('%d albums are found' % (len(albums)))
        more_indexes = find_next_indexes(soup)
        logging.debug('%d sub indexes are found' % (len(more_indexes)))
        wantudal.save_url(url, pagetype=wantudal.PageType.AlbumIndex,
                          isfinished=1)
        for album in albums:
            wantudal.save_url(album, pagetype=wantudal.PageType.AlbumPage,
                              referrerurl=url, isfinished=0)
            submit_html_job(album)
        for index in more_indexes:
            wantudal.save_url(index, pagetype=wantudal.PageType.AlbumIndex,
                              referrerurl=url, isfinished=0)
            submit_html_job(index)
    elif is_album_page(url):
        details = get_detail_from_album(soup, url)
        logging.debug('%d details are found' % (len(details)))
        more_indexes = find_next_indexes(soup)
        logging.debug('%d sub indexes are found' % (len(more_indexes)))
        wantudal.save_url(url, pagetype=wantudal.PageType.AlbumPage,
                          isfinished=1)
        for detail in details:
            wantudal.save_url(detail, pagetype=wantudal.PageType.DetailPage,
                              referrerurl=url, isfinished=0)
            submit_html_job(detail)
        for index in more_indexes:
            wantudal.save_url(index, pagetype=wantudal.PageType.AlbumPage,
                              referrerurl=url, isfinished=0)
            submit_html_job(index)
    elif is_detail_page(url):
        pic, description = get_content_from_detail(soup, url)
        wantudal.save_url(url, pagetype=wantudal.PageType.DetailPage,
                          isfinished=1)
        wantudal.save_url(pic, pagetype=wantudal.PageType.PicturePage,
                          referrerurl=url, description=description,
                          isfinished=0)
        submit_pic_job(pic)
    else:
        logging.debug('unknown resource')
def get_content_from_detail(soup, url):
    """Extract the picture URL and its description from a detail page.

    Reads the album id from the embedded JSON ``<script>`` block, the
    picture id from the URL path, and the user id from the ``u`` query
    parameter, then calls the PicDetailAjax endpoint (gbk-encoded JSON)
    for the actual picture path and description.

    Example of the embedded script this parses::

        <script id="pix-json-set-info" type="application/json">
        {
            "albumId": 17327808,
            "likeStatus": false,
            "categoryId":"11",
            "categoryEname":"chongwu",
            "prevAlbumUrl": "",
            "nextAlbumUrl": "/detail/52085246?u=60786793"
        }
        </script>

    Returns a ``(picture_url, description)`` tuple.
    """
    # Split the URL once instead of twice (path and query both needed).
    parts = urlsplit(url)
    picId = parts.path.split('/')[-1]
    userId = parse_qs(parts.query)['u'][0]
    albumId = json.loads(
        soup.find('script', id='pix-json-set-info').string)['albumId']
    # NOTE(review): `t` is a hard-coded timestamp copied from a captured
    # request; presumably the endpoint ignores its exact value — confirm.
    ajax_url = ('http://wantu.taobao.com/ajax/PicDetailAjax.do'
                '?picId=%s&userId=%s&albumId=%s&t=1365154666759&_method=read')
    ajax_url = ajax_url % (picId, userId, albumId)
    resp = json.loads(httplib.urlopen(ajax_url)[2].decode('gbk'))
    model = resp['data']['models'][0]
    picture = model['picPath']
    description = httplib.html_unescape(model['desc'])
    return (picture, description)