def worker_process_pic(gearman_worker, gearman_job):
    """Gearman task: fetch the picture whose URL is carried by the job.

    URLs that ``wantudal.is_processed`` reports as already handled are
    skipped. An ``HTTPError`` raised by the download is logged (status code
    only) and the job ends without re-raising.

    Args:
        gearman_worker: The worker running the task (unused here).
        gearman_job: The job; its ``data`` attribute is used as the URL.

    Returns:
        None in every case.
    """
    url = gearman_job.data
    if wantudal.is_processed(url):
        logging.debug('%s is skipped as it was processed already', url)
        return

    logging.debug('processing %s', url)
    try:
        # Element 2 of the urlopen() result is presumably the response body
        # -- TODO confirm against the project's httplib wrapper.
        # NOTE(review): the fetched payload is neither stored nor used in the
        # code visible here; confirm whether saving it / marking the URL as
        # finished is supposed to happen in this function.
        content = httplib.urlopen(url)[2]
    except HTTPError as e:  # "as" form is valid on Python 2.6+ and Python 3
        logging.debug('http error: %s', e.code)
        return
def _queue_html_pages(page_urls, pagetype, referrer):
    """Record each URL as an unfinished page and submit an HTML job for it."""
    for page_url in page_urls:
        wantudal.save_url(page_url, pagetype=pagetype, referrerurl=referrer,
                          isfinished=0)
        submit_html_job(page_url)


def _fan_out_listing(url, soup, children, own_type, child_type):
    """Mark a listing page finished, then queue its children and sub indexes."""
    more_indexes = find_next_indexes(soup)
    logging.debug('%d sub indexes are found', len(more_indexes))

    wantudal.save_url(url, pagetype=own_type, isfinished=1)
    _queue_html_pages(children, child_type, url)
    # Sub indexes are saved with the same page type as the page that linked
    # to them.
    _queue_html_pages(more_indexes, own_type, url)


def worker_process_html(gearman_worker, gearman_job):
    """Gearman task: download an HTML page and queue the pages it links to.

    URLs that ``wantudal.is_processed`` reports as already handled are
    skipped. Otherwise the page is fetched, decoded and parsed, and then
    handled according to the kind of URL:

    * album index -> saved as finished; every album found and every further
      index page is saved as unfinished and submitted as an HTML job.
    * album page  -> saved as finished; every detail page found and every
      further album page is saved as unfinished and submitted as an HTML job.
    * detail page -> saved as finished; the picture URL (with its
      description) is saved as unfinished and submitted as a picture job.
    * anything else is only logged as an unknown resource.

    Errors raised by the download, decoding or parsing are not caught here
    and propagate to the caller.

    Args:
        gearman_worker: The worker running the task (unused here).
        gearman_job: The job; its ``data`` attribute is used as the URL.

    Returns:
        None in every case.
    """
    url = gearman_job.data
    if wantudal.is_processed(url):
        logging.debug('%s is skipped as it was processed already', url)
        return

    logging.debug('processing %s', url)
    # The site serves its pages GBK-encoded.
    # Element 2 of the urlopen() result is presumably the response body
    # -- TODO confirm against the project's httplib wrapper.
    html_doc = httplib.urlopen(url)[2].decode('gbk')
    soup = BeautifulSoup(html_doc,
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

    if is_album_index(url):
        albums = get_album_from_index(soup)
        logging.debug('%d albums are found', len(albums))
        _fan_out_listing(url, soup, albums,
                         own_type=wantudal.PageType.AlbumIndex,
                         child_type=wantudal.PageType.AlbumPage)
    elif is_album_page(url):
        details = get_detail_from_album(soup, url)
        logging.debug('%d details are found', len(details))
        _fan_out_listing(url, soup, details,
                         own_type=wantudal.PageType.AlbumPage,
                         child_type=wantudal.PageType.DetailPage)
    elif is_detail_page(url):
        pic, description = get_content_from_detail(soup, url)
        wantudal.save_url(url, pagetype=wantudal.PageType.DetailPage,
                          isfinished=1)
        wantudal.save_url(pic, pagetype=wantudal.PageType.PicturePage,
                          referrerurl=url, description=description,
                          isfinished=0)
        submit_pic_job(pic)
    else:
        logging.debug('unknown resource')