def find_image_urls(page_url):
    # Find all image URLs on a single image page
    soup = commons.soup(page_url, encoding='gbk')
    # print('process page', page_url)
    images = soup.find_all(image_url_pattern)
    urls = [img.get('src') for img in images]
    return filter(None, urls)
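# All of these scrapers call a shared commons.soup(url, encoding=...) helper
# that is not included in this excerpt. A minimal sketch of what it likely
# does, assuming requests + BeautifulSoup (the name, signature and defaults
# below are inferred from the call sites, not the project's actual code):
import requests
from bs4 import BeautifulSoup


def soup(url, encoding=None, timeout=30):
    # Fetch the page and parse it, letting the caller override the
    # detected encoding (e.g. gbk pages that declare gb2312)
    resp = requests.get(url, timeout=timeout)
    if encoding:
        resp.encoding = encoding
    return BeautifulSoup(resp.text, 'html.parser')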
def find_album_urls_by_tag(url):
    # Find all album URLs on a tag page
    # print('finding albums in {0}'.format(url))
    soup = commons.soup(url, encoding='gbk')
    pages = soup.find_all(page_url_pattern)
    urls = [p.get('href') for p in pages]
    return filter(None, urls)
def get_taotu_pages(category_url):
    # Find all paginated listing URLs under a category
    print('process category: {0}'.format(category_url))
    soup = commons.soup(category_url, encoding='utf8')
    print('process index: {0}'.format(soup.title))
    last_no = get_last_page_no(soup)
    urls = ['{0}/list_{1}.html'.format(category_url, i)
            for i in range(2, last_no + 1)]
    # for url in urls:
    #     download_by_page(url)
    retry = 0
    while True:
        pool = ThreadPool(4)
        try:
            pool.map(download_by_page, urls)
            pool.close()
            pool.join()
            print('all images downloaded completely.')
            break
        except KeyboardInterrupt, e:
            print('download terminated by user, quit now.', e)
            pool.terminate()
            pool.join()
            break
        except Exception, e:
            pool.terminate()
            pool.join()
            retry += 1
            traceback.print_exc()
            try:
                print('download error: {0}, {1} retry in {2}s'.format(
                    e, retry, retry * 20 % 120))
            except Exception:
                pass
            time.sleep(retry * 20 % 120)
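# get_taotu_pages and download_taotu_images both rely on a get_last_page_no
# helper that is not part of this excerpt. One plausible sketch, assuming the
# last page number can be recovered from the *_N.html pagination links in the
# markup (the regex and the fallback to 1 are assumptions):
import re


def get_last_page_no(soup):
    last = 1
    for a in soup.find_all('a', href=True):
        m = re.search(r'_(\d+)\.html$', a['href'])
        if m:
            last = max(last, int(m.group(1)))
    return last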
def get_image_urls_for_taotu(url):
    # Find the image URLs contained in one page of a photo set
    soup = commons.soup(url, encoding='utf8')
    imgs = soup.select('#big-pic')[0].find_all('img')
    urls = [img.get('src') for img in imgs]
    print('found {0} images in {1}'.format(len(urls), url))
    return urls
def print_css_links(url):
    soup = commons.soup(url)
    raw_css_urls = [
        link["href"] for link in soup.findAll("link")
        if "stylesheet" in link.get("rel", [])
    ]
    css_urls = [
        u'https:%s' % u if u.startswith(u'//') else u
        for u in raw_css_urls
    ]
    # Emit each resolved stylesheet URL
    for css_url in css_urls:
        print(css_url)
def find_album_pages(album_url):
    # Find all pagination URLs of an album page
    soup = commons.soup(album_url, encoding='gbk')
    title = soup.title
    album = os.path.basename(album_url)[:-4]
    print('process album', album_url)
    links = soup.find_all(href=re.compile('{0}'.format(album)))
    page_url_pattern = album_url.replace(album, '{0}')[:-4]
    # print(page_url_pattern)
    pages = [page_url_pattern.format(l.get('href')) for l in links]
    return title.text, set(filter(None, pages))
def find_all_tugua_urls(page):
    url = url_tpl.format(page)
    print('fetch {0}'.format(url))
    soup = commons.soup(url, encoding='gbk')
    links = soup.find_all(filter_page_url)
    if links:
        print('fetch {0} urls in page {1}'.format(len(links), page))
        lock.acquire()
        urls.extend(links)
        lock.release()
    return links
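# find_all_tugua_urls is written to be mapped over many listing pages in
# parallel, appending into a shared module-level urls list guarded by lock.
# A rough driver sketch under those assumptions (pool size and page range are
# placeholders, not values taken from the original project):
from multiprocessing.pool import ThreadPool

pool = ThreadPool(4)
try:
    pool.map(find_all_tugua_urls, range(1, 11))
finally:
    pool.close()
    pool.join()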
def get_all_tags():
    # Fetch all tags, deduplicated
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    tags_file = os.path.join(OUTPUT_DIR, 'tags.dat')
    if os.path.isfile(tags_file):
        with open(tags_file, 'rb') as f:
            print('found tags cache, skip fetch remote tags')
            return pickle.load(f)
    # Collect tags from the tags page
    url = 'http://www.umei.cc/tags/'
    soup = commons.soup(url, encoding='gbk')
    urls = soup.find_all(href=tag_pattern)
    tags = [tag_pattern.match(a.get('href')).group(1) for a in urls]
    # Collect tags from the index page
    url = 'http://www.umei.cc/'
    soup = commons.soup(url, encoding='gbk')
    urls = soup.find_all(href=tag_pattern)
    tags.extend([tag_pattern.match(a.get('href')).group(1) for a in urls])
    # Deduplicate before caching so the cached and fresh results match
    tags = sorted(set(tags))
    with open(tags_file, 'wb') as f:
        pickle.dump(tags, f)
    return tags
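# get_all_tags assumes a module-level tag_pattern: a compiled regex that both
# filters hrefs in find_all() and captures the tag slug via group(1). The
# actual URL layout on umei.cc is not shown in this excerpt, so the pattern
# below is only an illustrative guess at its shape:
import re

tag_pattern = re.compile(r'/tags/([^/.]+)\.htm')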
def download_page(item):
    id = item['id']
    url = page_tpl.format(id)
    # Skip pages that have already been downloaded
    filename = os.path.join(OUTPUT, '{}.html'.format(id))
    if os.path.exists(filename):
        print('skip page {0}'.format(url))
        return
    print('download page: {0}'.format(url))
    # Must decode as gbk, otherwise traditional Chinese characters are garbled;
    # the page declares gb2312, but browsers actually treat it as gbk
    soup = commons.soup(url, encoding='gbk')
    # Collect all image URLs
    imgs = soup.find_all('img')
    # Directory for saving this page's images
    img_dirname = 'images_{0}'.format(id)
    imgdir = os.path.join(OUTPUT, img_dirname)
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)
    # Download the images one by one
    for img in imgs:
        from_src = img['src']
        # Skip images without a file extension
        if not os.path.splitext(from_src)[1]:
            continue
        if not from_src.startswith('http://'):
            # Prepend the site prefix to relative image URLs
            from_src = 'http://www.dapenti.com/blog/{0}'.format(from_src)
        # Strip characters that are not legal in filenames
        to_src = commons.get_safe_filename(from_src)
        imgfile = os.path.join(imgdir, to_src)
        # Rewrite the src to point at the local copy
        img['src'] = os.path.join(img_dirname, to_src)
        if os.path.exists(imgfile):
            # Skip images that already exist
            print('skip exists image {0}'.format(from_src))
        else:
            # Download images that are not present yet
            iurl, iname = download_image(from_src, imgfile, id)
            if not iname:
                # If the image could not be downloaded, keep the original URL
                img['src'] = from_src
    tempfile = '{0}.tmp'.format(filename)
    # Only save the page once the body and all images downloaded without errors
    with open(tempfile, 'w') as f:
        # The file is written as UTF-8, so the gb2312 charset declared in the
        # HTML head has to be rewritten to utf-8
        content = unicode(soup).replace('charset=gb2312', 'charset=utf-8')
        f.write(content.encode('utf8'))
    commons.safe_rename(tempfile, filename)
    print('page saved {0}'.format(url))
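# download_page unpacks the result of a download_image helper that is not part
# of this excerpt. Judging from how (iurl, iname) is used above, a sketch could
# look like this; the requests-based body and error handling are assumptions:
import os
import requests


def download_image(url, path, page_id):
    # Fetch the image and write it to path; return (url, filename) on success
    # and (url, None) on failure so the caller can keep the remote src
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        with open(path, 'wb') as f:
            f.write(resp.content)
        return url, os.path.basename(path)
    except Exception as e:
        print('[{0}] failed to download {1}: {2}'.format(page_id, url, e))
        return url, None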
def download_taotu_images(turl, output=OUTPUT):
    # Download all images of one photo set
    # http://www.aitaotu.com/guonei/5044.html
    # http://www.aitaotu.com/guonei/5044_10.html
    base_url = os.path.dirname(turl)
    page_no = os.path.basename(turl)[:-5]
    # Save images into a directory named after the photo set's number
    img_dir = os.path.join(output, page_no)
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    # A .dat file marks a photo set whose download already finished; skip it
    stat_file = os.path.join(output, '{0}.dat'.format(page_no))
    if os.path.isfile(stat_file):
        print('skip done page: {0}'.format(turl))
        return
    print('downloading page: {0}'.format(turl))
    soup = commons.soup(turl, encoding='utf8')
    # print('process page: {0}'.format(soup.title))
    images = []
    # Collect the image URLs from every page of this photo set
    for i in range(2, get_last_page_no(soup) + 1):
        purl = '{0}/{1}_{2}.html'.format(base_url, page_no, i)
        images.extend(get_image_urls_for_taotu(purl))
    # Download the images one by one
    for iurl in images:
        img_name = os.path.basename(iurl)
        img_file = os.path.join(img_dir, img_name)
        if os.path.isfile(img_file):
            # Skip images that already exist
            print('{0} skip image {1}'.format(page_no, img_file))
        else:
            # Download images that are not present yet
            print('{0} downloading {1}'.format(page_no, iurl))
            commons.download_file(iurl, img_file)
    # No exception was raised, so everything is downloaded; record the state
    with open(stat_file, 'wb') as f:
        pickle.dump(images, f)
    print('downloaded, save stat {0}'.format(turl))
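# The photo-set downloader delegates the actual transfer to
# commons.download_file, which is not shown here. A minimal stand-in, assuming
# a streaming requests download (name and signature taken from the call site):
import requests


def download_file(url, path, timeout=30):
    # Stream the response to disk so large images are not held in memory
    resp = requests.get(url, stream=True, timeout=timeout)
    resp.raise_for_status()
    with open(path, 'wb') as f:
        for chunk in resp.iter_content(8192):
            f.write(chunk)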
def find_album_urls_by_index(url):
    # Find all album URLs on a listing page
    # print('finding albums in {0}'.format(url))
    soup = commons.soup(url, encoding='gbk')
    pages = soup.find_all(page_url_pattern)
    return [urljoin(DOMAIN, p.get('href')) for p in pages]
def get_taotu_urls_for_page(url):
    # Find all photo-set URLs contained in one listing page
    soup = commons.soup(url, encoding='utf8')
    links = soup.find_all(filter_taotu_url)
    return [urljoin(DOMAIN, l.get('href')) for l in links]
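# A rough usage sketch tying the aitaotu.com helpers together: collect the
# photo-set URLs from one listing page and download each set. The listing URL
# below only illustrates the expected shape and is not taken from the project.
if __name__ == '__main__':
    list_url = 'http://www.aitaotu.com/guonei/'
    for taotu_url in get_taotu_urls_for_page(list_url):
        download_taotu_images(taotu_url)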