Example #1
def down_pic_include_child(down_path):
    file_name_list = FileTool.get_file_list('txt')
    # down_path = down_param['down_file_path']
    for index, file_name in enumerate(file_name_list, 1):
        print('Reading file %i: %s' % (index, file_name))
        # Open the file
        with open(file_name) as file_obj:
            for num, value in enumerate(file_obj, 1):
                line = value.strip('\n')
                if line == '':
                    print('Line %i is empty' % num)
                    continue
                print('Line %i: -%s-  ' % (num, line), end=';')
                # Collect the child-page links
                # child_img_url = get_img_child_url(line, pre_url)
                url_list = get_img_url_list(line)
                img_urls = url_list[0]
                # img_urls.extend(child_img_url)
                total = len(img_urls)
                print('Image count after deduplication: %i ' % total)
                new_title = url_list[1]
                # Record every processed link
                os.chdir(cur_dir)
                write_to_done_log(line, new_title)
                if total < 2:
                    os.chdir(cur_dir)
                    # save_not_down_url expects the target directory as its
                    # first argument (see its definition below)
                    save_not_down_url(cur_dir, line, new_title, num)
                else:
                    path = down_path + cur_month + str(new_title.strip()) + '/'
                    common.create_file(path)
                    os.chdir(path)
                    for i, img in enumerate(img_urls, 1):
                        file_url = img.get('file')
                        # Make relative links absolute
                        if not file_url.startswith('http'):
                            print('in:' + file_url)
                            file_url = pre_url + file_url
                        image_name = file_url.split("/")[-1]
                        # Skip images that already exist on disk
                        if not os.path.exists(image_name):
                            print('Line %i: image %i / %i: %s' %
                                  (num, i, total, file_url),
                                  end=';')
                            common.down_img(file_url)
                print('Line %i: %s done, moving to the next line' % (num, line))
        print('File %i: %s done, deleting...' % (index, file_name))
        os.remove(file_name)
        print('File %i: %s deleted, reading the next file' % (index, file_name))
    print("---------------- all files downloaded ----------------")
Example #2
def down_all_pic(down_param):
    path_ = down_param + cur_month
    if not os.path.exists(path_):
        os.makedirs(path_)
    file_name_list = FileTool.get_file_list('txt')
    for index, file_name in enumerate(file_name_list, 1):
        print('Reading file %i: %s' % (index, file_name))
        # Open the file
        with open(file_name) as file_obj:
            for num, value in enumerate(file_obj, 1):
                line = value.strip('\n')
                if line == '':
                    print('Line %i is empty' % num)
                    continue
                print('Line %i: -%s- ' % (num, line), end=' ;')
                # Collect all image links
                url_list = get_img_url_list(line)
                img_urls = url_list[0]
                total = len(img_urls)
                print(' image count: %i ' % total)
                new_title = url_list[1]

                if total < 2:
                    os.chdir(cur_dir)
                    # save_not_down_url expects the target directory as its
                    # first argument (see its definition below)
                    save_not_down_url(cur_dir, line, new_title, num)
                else:
                    path = path_ + str(new_title.strip()) + os.sep
                    common.create_file(path)
                    os.chdir(path)
                    for i, img in enumerate(img_urls, 1):
                        file_url = img.get('file')
                        # Make relative links absolute
                        if not file_url.startswith('http'):
                            print('in:' + file_url)
                            file_url = pre_url + file_url
                        image_name = file_url.split("/")[-1]
                        # Skip images that already exist on disk
                        if not os.path.exists(image_name):
                            print('Line %i: image %i / %i: %s' %
                                  (num, i, total, file_url),
                                  end=' ;')
                            common.down_img(file_url)
                print('Line %i: %s done ' % (num, line))
                # Record every downloaded link
                os.chdir(cur_dir)
                write_to_done_log(line, new_title)
        print('File %i: %s done, deleting...' % (index, file_name))
        os.remove(file_name)
        print('File %i: %s deleted, reading the next file' % (index, file_name), end=";")
    print("---------------- all files downloaded ----------------")
Example #3
def write_to_done_log(line, new_title, dir_path=None):
    # Resolve the directory at call time; a default of os.getcwd() would be
    # evaluated only once, when the function is defined
    if dir_path is None:
        dir_path = os.getcwd()
    done_file_list = FileTool.get_appoint_file(
        'done-' + common.get_datetime('%Y-%m') + '.log', dir_path)
    if len(done_file_list) == 0:
        # os.path.join works whether or not dir_path ends with a separator
        done_log = os.path.join(
            dir_path, 'done-' + common.get_datetime('%Y-%m') + '.log')
    else:
        done_log = done_file_list[0]
        logger.info('Appending downloaded link to done.log: ' + done_log)
    with open(done_log, 'a+', encoding='utf-8') as f:
        f.write('%s:[%s,%s]\n' % (common.get_datetime('%Y/%m/%d %H:%M'),
                                  line.strip('\n'), new_title))
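Given the format string above, each call appends one timestamped record per link. Assuming common.get_datetime is a strftime-style formatter, a call such as write_to_done_log('http://example.com/page', 'my-title', '/tmp/logs') would append a line like:

2024/05/01 12:00:[http://example.com/page,my-title]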
Example #4
def save_not_down_url(dir_path, line, new_title, num):
    name_list = FileTool.get_appoint_file(
        'un_done-' + common.get_datetime('%Y-%m') + '.log', dir_path)
    if len(name_list) == 0:
        # os.path.join works whether or not dir_path ends with a separator
        file_name = os.path.join(
            dir_path, 'un_done-' + common.get_datetime('%Y-%m') + '.log')
    else:
        file_name = name_list[0]
        logger.info('un_down_file:' + file_name)
    with open(file_name, 'a+', encoding='utf-8') as f:
        # Alternative format that records the source line number:
        # f.write('Line ' + str(num) + ': ' + line + ',' + new_title + '\n')
        f.write('%s:[%s,%s]\n' %
                (common.get_datetime('%Y/%m/%d %H:%M'), line, new_title))
Example #5
def dow_img_from_file(file_name, category_name):
    # Resolve the per-category target directory
    dir_path = FileTool.get_classify_dir_path(category_name)
    un_down = []
    with open(file_name) as file_obj:
        for num, value in enumerate(file_obj, 1):
            line = value.strip('\n')
            if line == '':
                logger.info('Line %i is empty' % num)
                continue
            logger.info(line)
            # Collect all image links
            url_list = get_img_url_list(line)
            img_urls = url_list[0]
            logger.info('Line %i: -%s- ; image count: %i ' %
                        (num, line, len(img_urls)))
            new_title = str(url_list[1])
            logger.info(new_title)
            # Fewer than 2 images: record the link for later unless the title
            # carries the '删' (delete) marker, then skip the line either way
            if len(img_urls) < 2:
                if '删' not in new_title:
                    save_not_down_url(dir_path, line, new_title, num)
                continue
            path = create_down_root_path(category_name, str(new_title.strip()))
            os.chdir(path)
            fs = []
            start = time.time()
            need_down = []
            for img in img_urls:
                file_url = img.get('file')
                # Make relative links absolute
                if not file_url.startswith('http'):
                    logger.info('in:' + file_url)
                    file_url = pre_url + file_url
                image_name = file_url.split("/")[-1]
                if not os.path.exists(image_name):
                    need_down.append({'file_url': file_url, 'path': path})
                else:
                    logger.info('Line %i: -%s- ; url:%s; file %s already exists ' %
                                (num, line, file_url, image_name))
            if len(need_down) > 0:
                with futures.ThreadPoolExecutor(
                        max_workers=min(5, len(need_down)),
                        thread_name_prefix="down-thread") as executor:
                    for index1, value1 in enumerate(need_down, 1):
                        submit = executor.submit(
                            DownTool.future_dowm_img, value1['file_url'],
                            ProxyIp.ProxyIp().get_random_proxy_ip(), num,
                            len(need_down), index1, value1['path'])
                        # Callback fired once the future completes
                        submit.add_done_callback(common.executor_callback)
                        fs.append(submit)
                futures.wait(fs, timeout=15)
            logger.info('Line %i: %s done; elapsed: %i s' %
                        (num, line, time.time() - start))
            # Record every downloaded link
            os.chdir(cur_dir)
            write_to_done_log(line, new_title, dir_path)
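The fan-out in dow_img_from_file is the standard concurrent.futures pattern: submit one task per pending URL, attach a completion callback, then wait with a timeout. A minimal self-contained sketch of that pattern, with illustrative stand-ins for DownTool.future_dowm_img and common.executor_callback (neither is shown in this module):

from concurrent import futures

def fake_download(url):
    # Stand-in worker; the real code calls DownTool.future_dowm_img here.
    return 'saved ' + url.rsplit('/', 1)[-1]

def on_done(future):
    # Stand-in for common.executor_callback: surface worker results/errors.
    exc = future.exception()
    print(future.result() if exc is None else 'failed: %s' % exc)

urls = ['http://example.com/a.jpg', 'http://example.com/b.jpg']
fs = []
with futures.ThreadPoolExecutor(max_workers=min(5, len(urls)),
                                thread_name_prefix='down-thread') as executor:
    for url in urls:
        f = executor.submit(fake_download, url)
        f.add_done_callback(on_done)
        fs.append(f)
futures.wait(fs, timeout=15)  # mirrors the original; the with-block already joins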
Example #6
def get_down_map(self, arg='filter', file_type='text'):
    file_dir = self.get_file_dir(arg)
    return FileTool.get_file_map(file_dir, file_type)
Example #7
def get_down_file(self, arg='filter'):
    file_dir = self.get_file_dir(arg)
    return FileTool.get_file_list('text', file_dir)
Example #8
# from furl import furl
# from pypinyin import pinyin, lazy_pinyin, Style
import logging
import os

# import threading
import p**n
from common import FileTool

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    file_map = FileTool.get_file_map(os.getcwd() + os.sep, 'txt', 2, '-', 0)
    # Iterate over the grouped file lists
    for category_name, file_list in file_map.items():
        print(category_name)
        # Iterate over the files in this group
        for index, file_name in enumerate(file_list, 1):
            p**n.dow_img_from_file(file_name, category_name)
            print()
            logger.info('File %i: %s done, deleting...' % (index, file_name))
            print()
            os.remove(file_name)
            if index == len(file_list):
                logger.info("------------删除成功,所有文件下载完毕------------------")
            else: