def download_article(article):
    name = article['name']
    print('Start downloading [' + name + ']')
    save_dir = path.join(save_path, name)
    if not path.exists(save_dir):
        os.makedirs(save_dir)
    pics = get_pics(article['href'])
    for pic in pics:
        pic_name = pic['name']
        pic_href = pic['href']
        save_file = path.join(save_dir, pic_name)
        if path.exists(save_file) or savepath.check_exists(
                dir_name, name, pic_name):
            # print(save_file + ' already exists')
            pass
        else:
            try:
                response = session.get(pic_href, cookies=cookies,
                                       verify=False, timeout=(3, 3))
                if response.status_code == 200:
                    with open(save_file, 'wb+') as f:
                        f.write(response.content)
                    # print('Downloaded to ' + save_file)
            except Exception as e:
                print(repr(e))
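# `savepath.check_exists(dir_name, article, file_name)` is defined outside this
# section. A minimal sketch of what it is assumed to do, as called here with
# three arguments: report whether the file was already saved under an earlier
# download root, so re-runs skip pictures that moved. `old_save_paths` is a
# hypothetical name for that list of roots.
import os

old_save_paths = []  # hypothetical: previously used download roots

def check_exists(dir_name, article, file_name):
    # True if any earlier root already holds <root>/<dir_name>/<article>/<file_name>.
    return any(
        os.path.exists(os.path.join(root, dir_name, article, file_name))
        for root in old_save_paths
    )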
def download_article(article):
    print('Start downloading [{}]'.format(article))
    article_name = article['name']
    save_dir = path.join(save_path, escape(article_name))
    if not path.exists(save_dir):
        os.makedirs(save_dir)
    article_href = article['href']
    article_id = article_href.split('/')[-1]
    num = 1
    pic_href = dowload_url + '/' + article_id + '/' + str(num) + '.jpg'
    try:
        response = session.get(pic_href, verify=False, timeout=(3, 3))
        # Probe sequentially numbered images until the server stops answering 200.
        while response.status_code == 200:
            save_file = path.join(save_dir, str(num) + '.jpg')
            if not path.exists(save_file) and not savepath.check_exists(
                    dir_name, escape(article_name), str(num) + '.jpg'):
                with open(save_file, 'wb+') as f:
                    f.write(response.content)
                print('Downloaded to ' + save_file)
            else:
                print(save_file + ' already exists')
            num += 1
            pic_href = dowload_url + '/' + article_id + '/' + str(num) + '.jpg'
            response = session.get(pic_href, verify=False, timeout=(3, 3))
    except Exception as e:
        print(repr(e))
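# `escape(name)` sanitizes a title for use as a directory name. Its real
# implementation is not part of this section; a plausible sketch, assuming it
# simply replaces characters that are illegal in Windows/Unix file names:
def escape(name):
    # Swap filesystem-hostile characters for a safe stand-in.
    for ch in '\\/:*?"<>|':
        name = name.replace(ch, '_')
    return name.strip()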
def d_collection(collection):
    title = collection.find_all('p', class_=['biaoti'])[0] \
        .text.strip().replace('<', '《').replace('>', '》')
    path = os.path.join(save_path, title)
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except Exception as e:
            print(repr(e))
            return
    count_str = collection.find_all('span', class_=['shuliang'])[0].text
    count = int(count_str[0:-1])  # drop the trailing '张' counter word
    print('The collection has ' + str(count) + ' pictures')
    img_url = collection.find_all('img')[0]['src']
    prefix = img_url[0:img_url.rfind('/')]
    for num in range(1, count + 1):
        img_save_path = os.path.join(path, str(num) + '.jpg')
        if os.path.exists(img_save_path) or savepath.check_exists(
                dir_name, title, str(num) + '.jpg'):
            print('[' + img_save_path + '] already exists')
            continue
        url = prefix + '/' + str(num) + '.jpg'
        try:
            content_response = session.get(url, timeout=3)
            if content_response.status_code == 200:
                with open(img_save_path, 'wb') as f:
                    f.write(content_response.content)
                print('Downloaded [' + img_save_path + ']')
        except Exception as e:
            print(repr(e))
            continue
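# `session` is module-level state shared by all of these downloaders. A sketch
# of a typical setup, assuming plain requests with a retrying adapter; the
# retry count is illustrative, not taken from the original source:
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))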
def download_article(article):
    name = article['name']
    print('Start downloading collection [' + name + ']')
    save_dir = path.join(save_path, name)
    if not path.exists(save_dir):
        os.makedirs(save_dir)
    article_href = article['href']
    article_href_head = article_href[0:-5]  # strip the trailing '.html'
    article_pages = get_article_pages(article_href)
    print('Collection [' + name + '] has [' + str(article_pages) + '] pictures')
    for article_page in range(1, article_pages + 1):
        # Page 1 keeps the original URL; later pages append '_<page>.html'.
        if article_page != 1:
            article_href = article_href_head + \
                '_' + str(article_page) + '.html'
        pic = get_pic(article_href)
        if pic:
            save_file = path.join(save_dir, pic['name'])
            if path.exists(save_file) or savepath.check_exists(
                    dir_name, name, pic['name']):
                print(save_file + ': already exists')
            else:
                try:
                    response = session.get(pic['href'], headers=headers,
                                           verify=False, timeout=(3, 3))
                    if response.status_code == 200:
                        with open(save_file, 'wb+') as f:
                            f.write(response.content)
                        print('Downloaded ' + save_file)
                except Exception as e:
                    print(repr(e))
                    print('Error downloading ' + pic['href'])
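# `get_article_pages(href)` is assumed to read the page count out of the
# article's pagination bar. A self-contained sketch under that assumption; the
# 'page' class selector is hypothetical, not the site's real markup:
import requests
from bs4 import BeautifulSoup

def get_article_pages(href):
    soup = BeautifulSoup(requests.get(href, timeout=5).text, 'html.parser')
    pagination = soup.find('div', class_='page')  # hypothetical selector
    links = pagination.find_all('a') if pagination else []
    # Keep the numeric page links and take the largest number.
    numbers = [int(a.text) for a in links if a.text.strip().isdigit()]
    return max(numbers) if numbers else 1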
def download_collection(c):
    referer = c.find_all('a')[0]['href']
    headers = {'Referer': referer}
    pic_num = int(c.find_all('p', class_=False)[0].text.split(' ')[1])
    pic = c.find_all('img')[0]
    pic_url = pic['src']
    prefix = pic_url[0:pic_url.rfind('/')]
    # Full-width replacements keep the title legal as a directory name.
    pic_name = pic['alt'].strip().replace('<', '《').replace('>', '》') \
        .replace(':', '：').replace(' ', '')
    path = os.path.join(save_path, escape(pic_name))
    if not os.path.exists(path):
        os.makedirs(path)
    for n in range(1, pic_num + 1):
        pic_save_path = os.path.join(path, str(n) + '.jpg')
        if os.path.exists(pic_save_path) or savepath.check_exists(
                dir_name, escape(pic_name), str(n) + '.jpg'):
            print('[' + pic_save_path + '] already exists')
            continue
        content = None
        try:
            content = session.get(prefix + '/' + str(n) + '.jpg',
                                  timeout=3, headers=headers).content
        except Exception as e:
            print(repr(e))
            continue
        if content:
            with open(pic_save_path, 'wb') as f:
                f.write(content)
            print('Downloaded [' + pic_save_path + ']')
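# A note on `c.find_all('p', class_=False)` above: BeautifulSoup treats
# `class_=False` as "match tags that have no class attribute", which is how the
# bare picture-count paragraph is singled out. A self-contained illustration
# with made-up markup:
from bs4 import BeautifulSoup

html = '<p class="title">x</p><p>Total 9 pics</p>'  # illustrative markup
soup = BeautifulSoup(html, 'html.parser')
print(soup.find_all('p', class_=False))  # [<p>Total 9 pics</p>]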
def download_article(article):
    name = article['name']
    print('Start downloading: ' + name)
    save_dir = path.join(save_path, name)
    if not path.exists(save_dir):
        os.makedirs(save_dir)
    pages = get_pic_pages(article['href'])
    for p in range(1, pages + 1):
        url = article['href'] + '?page=' + str(p)
        pic = get_pic(url)
        if pic:
            file_name = path.join(save_dir, pic['name'])
            if path.exists(file_name) or savepath.check_exists(
                    dir_name, name, pic['name']):
                print(file_name + ': already exists')
            else:
                try:
                    response = session.get(pic['href'], headers=headers,
                                           verify=False, timeout=(10, 10))
                    if response.status_code == 200:
                        with open(file_name, 'wb+') as f:
                            f.write(response.content)
                        print('Downloaded to: ' + file_name)
                except Exception as e:
                    print(repr(e))
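# `get_pic(url)` is expected to return {'name': ..., 'href': ...} for the
# single picture on one article page, or None if there is none. A sketch; the
# 'content' div selector is a guess at the site's markup, not taken from it:
import requests
from bs4 import BeautifulSoup

def get_pic(url):
    soup = BeautifulSoup(requests.get(url, timeout=5).text, 'html.parser')
    container = soup.find('div', class_='content')  # hypothetical selector
    img = container.find('img') if container else None
    if not img or not img.get('src'):
        return None
    src = img['src']
    return {'name': src.split('/')[-1], 'href': src}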
def download_article(article):
    print(article)
    global download_zip
    article_href = article['href']
    soup = get_soup(article_href)
    real_article_name = get_real_article_name(soup)
    if real_article_name:
        article_name = real_article_name
    else:
        article_name = article['name'].strip()
    # A trailing dot would confuse some filesystems; drop it.
    if article_name.endswith('.'):
        article_name = article_name[:-1]
    save_dir = path.join(save_path, article_name)
    if not path.exists(save_dir):
        try:
            os.makedirs(save_dir)
        except Exception as e:
            print(repr(e))
    if download_zip:
        zip_info = get_zip(soup)  # renamed from `zip` to avoid shadowing the builtin
        if zip_info:
            print('Found zip package: {}'.format(zip_info))
            zip_name = zip_info['name']
            zip_href = zip_info['href']
            zip_file = path.join(save_dir, zip_name)
            if not savepath.check_exists(dir_name, article_name, zip_name):
                download(zip_file, zip_href)
    pics = get_pics(soup)
    pic_page = 1
    article_url = article_href[0:-5] + '_{}.html'
    # Walk the paginated article until a page yields no pictures.
    while pics:
        for pic in pics:
            pic_name = pic['name']
            pic_href = pic['href']
            pic_file = path.join(save_dir, pic_name)
            if not savepath.check_exists(dir_name, article_name, pic_name):
                download(pic_file, pic_href)
        pic_page += 1
        soup = get_soup(article_url.format(pic_page))
        pics = get_pics(soup)
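# `download(file_path, url)` is this module's shared fetch-and-save helper. A
# minimal sketch, assuming it mirrors the inline pattern of the other
# downloaders (GET via the module-level `session`, check for 200, write bytes):
def download(file_path, url):
    try:
        response = session.get(url, verify=False, timeout=(10, 10))
        if response.status_code == 200:
            with open(file_path, 'wb+') as f:
                f.write(response.content)
            print('Downloaded ' + file_path)
    except Exception as e:
        print(repr(e))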
def download(article: dict):
    pics = get_pics(article)
    dir_name = os.path.join(savepath.save_path, article['title'])
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    for pic in pics or []:
        pic_name = pic.split('/')[-1]
        # Check the dedupe index before fetching to avoid a wasted request.
        if savepath.check_exists(article['title'], pic_name):
            continue
        content = get_pic_content(pic)
        if content:
            pic_path = os.path.join(dir_name, pic_name)
            with open(pic_path, 'wb+') as f:
                f.write(content)
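# `get_pic_content(pic)` takes an image URL and returns its bytes, or None on
# failure. A minimal sketch under that assumption, again using the module-level
# `session`:
def get_pic_content(pic):
    try:
        response = session.get(pic, timeout=(5, 5))
        if response.status_code == 200:
            return response.content
    except Exception as e:
        print(repr(e))
    return None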
def get_pics(href):
    global cookies
    index = 1
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
        'Connection': 'keep-alive',
        'Referer': href
    }
    content = parse_url(href, headers=headers, cookies=cookies)
    div = content.find('div', class_='main-image') if content else None
    if div:
        pic = div.find('img')
        pic_base_url = pic['src'][:-6]  # strip the '01.jpg'-style suffix, keep the prefix
        pic_title = escape(pic['alt'])
        print('Start downloading {}, {}'.format(pic_title, pic['src']))
        path = os.path.join(save_path, pic_title)
        if not os.path.exists(path):
            os.makedirs(path)
        # Probe sequentially numbered images; stop at the first non-200 answer.
        while True:
            pic_save_path = os.path.join(path, str(index) + '.jpg')
            if os.path.exists(pic_save_path) or savepath.check_exists(
                    dir_name, pic_title, str(index) + '.jpg'):
                # print(pic_save_path + ' already exists!')
                index += 1
                continue
            pic_url = pic_base_url + str(index) + '.jpg'
            try:
                get_pic_response = session.get(pic_url, headers=headers,
                                               cookies=cookies, timeout=5)
                if get_pic_response and get_pic_response.status_code == 200:
                    cookies = get_pic_response.cookies
                    with open(pic_save_path, 'wb+') as f:
                        f.write(get_pic_response.content)
                else:
                    break
            except Exception as e:
                print(repr(e))
            index += 1
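# `parse_url(href, headers=..., cookies=...)` is assumed to fetch a page and
# return it as a BeautifulSoup tree, or None on failure. A self-contained
# sketch under that assumption:
import requests
from bs4 import BeautifulSoup

def parse_url(href, headers=None, cookies=None):
    try:
        response = requests.get(href, headers=headers, cookies=cookies, timeout=5)
        if response.status_code == 200:
            return BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        print(repr(e))
    return None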
def download_collection(collection):
    title = escape(collection[0])
    p = os.path.join(save_path, title)
    if not os.path.exists(p):
        os.makedirs(p)
    href = collection[1]
    headers = {'Referer': href}
    img_url = get_img_start_url(href)
    if not img_url:
        return
    num = 1
    # Probe numbered images until a download fails.
    while True:
        if savepath.check_exists(dir_name, title, str(num) + '.jpg'):
            got_pic = True  # already saved elsewhere; count it as success
        else:
            got_pic = download(
                img_url + '/' + str(num) + '.jpg',
                os.path.join(save_path, title, str(num) + '.jpg'),
                headers)
        if not got_pic:
            break
        num += 1
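# This module's `download(url, save_path, headers)` must return a truthy value
# on success, since the loop above stops at the first falsy result. A sketch
# under that assumption (note the argument order differs from the
# `download(file_path, url)` helper used earlier in this section):
def download(url, save_path, headers):
    try:
        response = session.get(url, headers=headers, timeout=3)
        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            print('Downloaded [' + save_path + ']')
            return True
    except Exception as e:
        print(repr(e))
    return False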
def download_article(article):
    global g_cookies
    print('Start downloading: ' + article['name'])
    save_dir = path.join(save_path, escape(article['name']))
    if not path.exists(save_dir):
        os.makedirs(save_dir)
    pics = get_pics(article)
    for pic in pics:
        save_file = path.join(save_dir, pic['name'])
        if path.exists(save_file) or savepath.check_exists(
                dir_name, escape(article['name']), pic['name']):
            # print(save_file + ': already exists!')
            continue
        new_headers = {
            'Referer': pic['referer'],
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0',
            'Accept': 'image/webp,*/*',
            'Accept-Language': 'zh-CN,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
        }
        try:
            response = session.get(pic['href'], headers=new_headers,
                                   cookies=g_cookies, timeout=(10, 10))
            if response.status_code == 200:
                g_cookies = response.cookies
                with open(save_file, 'wb+') as f:
                    f.write(response.content)
            else:
                print('Failed to fetch ' + pic['href'])
        except Exception as e:
            print(repr(e))
            continue
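# In this module, `get_pics(article)` must yield dicts carrying 'name', 'href'
# and the 'referer' page each image was found on, since the loop above builds a
# per-image Referer header. A sketch under that assumption; the selectors and
# the one-image-per-page layout are guesses:
import requests
from bs4 import BeautifulSoup

def get_pics(article):
    page_url = article['href']
    soup = BeautifulSoup(requests.get(page_url, timeout=5).text, 'html.parser')
    pics = []
    for img in soup.find_all('img'):
        src = img.get('src')
        if src and src.endswith('.jpg'):
            pics.append({'name': src.split('/')[-1], 'href': src,
                         'referer': page_url})
    return pics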