def down_pic_include_child(down_path):
    file_name_list = FileTool.get_file_list('txt')
    for index, file_name in enumerate(file_name_list, 1):
        print('Reading file %i: %s' % (index, file_name))
        with open(file_name) as file_obj:
            for num, value in enumerate(file_obj, 1):
                line = value.strip('\n')
                if line == '':
                    print('Line %i is empty' % num)
                    continue
                print('Line %i: -%s- ' % (num, line), end=';')
                # Collect the image URLs (child-page links used to be merged
                # in here via get_img_child_url(line, pre_url)).
                url_list = get_img_url_list(line)
                img_urls = url_list[0]
                total = len(img_urls)
                print('Image count after deduplication: %i ' % total)
                new_title = url_list[1]
                # Record every processed link
                os.chdir(cur_dir)
                write_to_done_log(line, new_title)
                if total < 2:
                    os.chdir(cur_dir)
                    # save_not_down_url expects the target directory first
                    save_not_down_url(cur_dir, line, new_title, num)
                else:
                    path = down_path + cur_month + str(new_title.strip()) + '/'
                    common.create_file(path)
                    os.chdir(path)
                    for i, img in enumerate(img_urls, 1):
                        file_url = img.get('file')
                        if not file_url.startswith('http'):
                            print('relative url: ' + file_url)
                            file_url = pre_url + file_url
                        image_name = file_url.split('/')[-1]
                        # Skip images that already exist on disk
                        if not os.path.exists(image_name):
                            print('Line %i: image %i / %i: %s' % (num, i, total, file_url), end=';')
                            common.down_img(file_url)
                    print('Line %i: %s downloaded, moving to the next line ' % (num, line))
        print('File %i: %s downloaded, deleting...' % (index, file_name))
        os.remove(file_name)
        print('File %i: %s deleted, reading the next file' % (index, file_name))
    print('---------------- All files downloaded -------------------')
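# Hedged sketch: common.down_img is not shown in this module. From its call
# sites it is assumed to fetch one URL and write the body into the current
# working directory (the callers chdir() into the target folder first).
# The use of requests and the error handling below are assumptions, not the
# project's actual implementation.
import requests


def down_img_sketch(file_url, timeout=10):
    """Download file_url into the current directory; report HTTP errors."""
    image_name = file_url.split('/')[-1]
    resp = requests.get(file_url, timeout=timeout)
    if resp.status_code == 200:
        with open(image_name, 'wb') as f:
            f.write(resp.content)
    else:
        print('download failed %s: %s' % (resp.status_code, file_url))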
def down_all_pic(down_param):
    path_ = down_param + cur_month
    if not os.path.exists(path_):
        os.makedirs(path_)
    file_name_list = FileTool.get_file_list('txt')
    for index, file_name in enumerate(file_name_list, 1):
        print('Reading file %i: %s' % (index, file_name))
        with open(file_name) as file_obj:
            for num, value in enumerate(file_obj, 1):
                line = value.strip('\n')
                if line == '':
                    print('Line %i is empty' % num)
                    continue
                print('Line %i: -%s- ' % (num, line), end=' ;')
                # Collect every image URL on the page
                url_list = get_img_url_list(line)
                img_urls = url_list[0]
                print(' image count: %i ' % len(img_urls))
                new_title = url_list[1]
                if len(img_urls) < 2:
                    os.chdir(cur_dir)
                    # save_not_down_url expects the target directory first
                    save_not_down_url(cur_dir, line, new_title, num)
                else:
                    path = path_ + str(new_title.strip()) + os.sep
                    common.create_file(path)
                    os.chdir(path)
                    for i, img in enumerate(img_urls, 1):
                        file_url = img.get('file')
                        if not file_url.startswith('http'):
                            print('relative url: ' + file_url)
                            file_url = pre_url + file_url
                        image_name = file_url.split('/')[-1]
                        if not os.path.exists(image_name):
                            print('Line %i: image %i / %i: %s' % (num, i, len(img_urls), file_url), end=' ;')
                            common.down_img(file_url)
                    print('Line %i: %s downloaded ' % (num, line))
                # Record every processed link
                os.chdir(cur_dir)
                write_to_done_log(line, new_title)
        print('File %i: %s downloaded, deleting...' % (index, file_name))
        os.remove(file_name)
        print('File %i: %s deleted, reading the next file' % (index, file_name), end=';')
    print('---------------- All files downloaded -------------------')
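# Hedged sketch: get_img_url_list is defined elsewhere. From its call sites
# it returns a two-element list [img_urls, title], where img_urls is a list
# of dicts each carrying a 'file' key with an absolute or relative image
# URL. The parser below (BeautifulSoup, the <img>/src selection, the
# dedup strategy) is an assumption, not the project's actual code.
import requests
from bs4 import BeautifulSoup


def get_img_url_list_sketch(page_url):
    soup = BeautifulSoup(requests.get(page_url, timeout=10).text, 'html.parser')
    title = soup.title.string if soup.title else page_url
    # Deduplicate while keeping order, matching the 'after dedup' count
    seen, img_urls = set(), []
    for img in soup.find_all('img'):
        src = img.get('src')
        if src and src not in seen:
            seen.add(src)
            img_urls.append({'file': src})
    return [img_urls, title]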
def write_to_done_log(line, new_title, dir_path=None):
    # Resolve the default at call time; os.getcwd() as a default value
    # would be frozen at import time, ignoring later os.chdir() calls.
    if dir_path is None:
        dir_path = os.getcwd()
    log_name = 'done-' + common.get_datetime('%Y-%m') + '.log'
    done_file_list = FileTool.get_appoint_file(log_name, dir_path)
    if len(done_file_list) == 0:
        done_log = os.path.join(dir_path, log_name)
    else:
        done_log = done_file_list[0]
    logger.info('Saving downloaded image link --->>> done.log: ' + done_log)
    with open(done_log, 'a+', encoding='utf-8') as f:
        f.write('%s:[%s,%s]\n' % (common.get_datetime('%Y/%m/%d %H:%M'),
                                  line.strip('\n'), new_title))
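# Example record (hypothetical values): calling
#   write_to_done_log('http://example.com/a/1', 'Some Title')
# appends a line like
#   2024/05/01 12:30:[http://example.com/a/1,Some Title]
# to done-2024-05.log under dir_path. save_not_down_url below writes the
# same format to un_done-<year-month>.log.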
def save_not_down_url(dir_path, line, new_title, num):
    # num (the source line number) is kept for the call sites; it used to
    # be part of the written record.
    log_name = 'un_done-' + common.get_datetime('%Y-%m') + '.log'
    name_list = FileTool.get_appoint_file(log_name, dir_path)
    if len(name_list) == 0:
        file_name = os.path.join(dir_path, log_name)
    else:
        file_name = name_list[0]
    logger.info('un_down_file:' + file_name)
    with open(file_name, 'a+', encoding='utf-8') as f:
        f.write('%s:[%s,%s]\n' % (common.get_datetime('%Y/%m/%d %H:%M'),
                                  line, new_title))
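# Hedged sketch: FileTool.get_appoint_file is assumed to search dir_path
# for files with the given name and return the matching paths (an empty
# list when the log does not exist yet). A minimal stand-in:
import os


def get_appoint_file_sketch(name, dir_path):
    return [os.path.join(dir_path, f)
            for f in os.listdir(dir_path) if f == name]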
def dow_img_from_file(file_name, category_name):
    # Resolve the download directory for this category
    dir_path = FileTool.get_classify_dir_path(category_name)
    with open(file_name) as file_obj:
        for num, value in enumerate(file_obj, 1):
            line = value.strip('\n')
            if line == '':
                logger.info('Line %i is empty' % num)
                continue
            logger.info(line)
            # Collect every image URL on the page
            url_list = get_img_url_list(line)
            img_urls = url_list[0]
            logger.info('Line %i: -%s- ; image count: %i ' % (num, line, len(img_urls)))
            new_title = str(url_list[1])
            logger.info(new_title)
            # Fewer than 2 images: record the URL only when the title does
            # not carry the '删' (delete) marker; skip downloading either way.
            if len(img_urls) < 2:
                if '删' not in new_title:
                    save_not_down_url(dir_path, line, new_title, num)
                continue
            elif '删' in new_title:
                continue
            else:
                path = create_down_root_path(category_name, str(new_title.strip()))
                os.chdir(path)
                fs = []
                start = time.time()
                # Queue only the images that are not on disk yet
                need_down = []
                for img in img_urls:
                    file_url = img.get('file')
                    if not file_url.startswith('http'):
                        logger.info('relative url: ' + file_url)
                        file_url = pre_url + file_url
                    image_name = file_url.split('/')[-1]
                    if not os.path.exists(image_name):
                        need_down.append({'file_url': file_url, 'path': path})
                    else:
                        logger.info('Line %i: -%s- ; url:%s; file %s already exists '
                                    % (num, line, file_url, image_name))
                if len(need_down) > 0:
                    with futures.ThreadPoolExecutor(
                            max_workers=min(5, len(need_down)),
                            thread_name_prefix='down-thread') as executor:
                        for index1, value1 in enumerate(need_down, 1):
                            submit = executor.submit(
                                DownTool.future_dowm_img, value1['file_url'],
                                ProxyIp.ProxyIp().get_random_proxy_ip(), num,
                                len(need_down), index1, value1['path'])
                            # Runs once this future completes
                            submit.add_done_callback(common.executor_callback)
                            fs.append(submit)
                        futures.wait(fs, timeout=15)
                logger.info('Line %i: %s downloaded; took: %i s'
                            % (num, line, time.time() - start))
                # Record the processed link
                os.chdir(cur_dir)
                write_to_done_log(line, new_title, dir_path)
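# Hedged sketch of the thread-pool pattern used above: submit each job,
# attach a completion callback, then wait with a timeout. The callback
# mirrors what common.executor_callback presumably does -- surface any
# exception the worker raised, since submit() swallows them otherwise.
from concurrent import futures


def _callback_sketch(future):
    exc = future.exception()
    if exc is not None:
        print('worker failed: %r' % exc)


def run_pool_sketch(jobs, worker, max_workers=5):
    if not jobs:
        return set(), set()
    fs = []
    with futures.ThreadPoolExecutor(
            max_workers=min(max_workers, len(jobs))) as executor:
        for job in jobs:
            f = executor.submit(worker, job)
            f.add_done_callback(_callback_sketch)
            fs.append(f)
        # Checkpoint after 15 s; __exit__ still joins all workers
        done, not_done = futures.wait(fs, timeout=15)
    return done, not_done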
def get_down_map(self, arg='filter', file_type='text'):
    file_dir = self.get_file_dir(arg)
    return FileTool.get_file_map(file_dir, file_type)
def get_down_file(self, arg='filter'):
    file_dir = self.get_file_dir(arg)
    return FileTool.get_file_list('text', file_dir)
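# Usage sketch: 'self' is assumed to be a config/helper object that
# resolves a directory from a key via get_file_dir. A typical call site
# (DownConfig is a hypothetical owner class):
#   tool = DownConfig()
#   files = tool.get_down_file()   # 'text' files under the 'filter' dir
#   groups = tool.get_down_map()   # the same files, grouped by FileTool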
import logging
import os

import p**n
from common import FileTool

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    # Group the .txt files in the working directory (FileTool splits each
    # name on '-' and keys the map on segment 0)
    file_map = FileTool.get_file_map(os.getcwd() + os.sep, 'txt', 2, '-', 0)
    # Walk the grouped file lists
    for category_name, file_list in file_map.items():
        print(category_name)
        # Walk the files in this category
        for index, file_name in enumerate(file_list, 1):
            p**n.dow_img_from_file(file_name, category_name)
            print()
            logger.info('File %i: %s downloaded, deleting...' % (index, file_name))
            print()
            os.remove(file_name)
            if index == len(file_list):
                logger.info('------------ Deleted; all files downloaded ------------------')
            else:
                logger.info('------------ Deleted; reading the next file ------------------')
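# Hedged sketch: FileTool.get_file_map(dir, 'txt', 2, '-', 0) is read here
# as "list *.txt under dir, split each bare name on '-', and group by
# segment 0". The meaning of the extra positional args is an assumption
# based on this call site, not the actual FileTool implementation.
import os
from collections import defaultdict


def get_file_map_sketch(dir_path, ext='txt', sep='-', key_index=0):
    file_map = defaultdict(list)
    for name in os.listdir(dir_path):
        if name.endswith('.' + ext):
            key = os.path.splitext(name)[0].split(sep)[key_index]
            file_map[key].append(os.path.join(dir_path, name))
    return dict(file_map)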