Example #1
0
    def __init__(self, spider_dir_name):
        """Prepare the spider's directory and the path cache built on it.

        Args:
            spider_dir_name: Directory name that is combined with
                ``self.__home_dir__`` through ``IOUtils.merge_dir``.

        Raises:
            RuntimeError: If ``spider_dir_name`` is empty or otherwise falsy.
        """
        if not spider_dir_name:
            raise RuntimeError('必须指定图组名称')

        # Resolve the spider directory and make sure it is in place.
        spider_dir = IOUtils.merge_dir(self.__home_dir__, spider_dir_name)
        self.spider_dir = spider_dir
        IOUtils.create_dir_force(spider_dir)

        # Build the path cache for that directory.
        self.path_cache = PathCacheUtils(spider_dir)
Example #2
0
    def create_group_dir(self, group_name, group_url):
        """Return the directory of an image group, creating it when uncached.

        Args:
            group_name: Name of the group; its code is taken from the first
                element of ``self.get_group_code_and_title(group_name)``.
            group_url: URL handed to ``self.save_url_link`` for a new group.

        Returns:
            The path held by the path cache for the group's code, or the
            newly created directory when the cache has no entry.
        """
        code = self.get_group_code_and_title(group_name)[0]
        cached_dir = self.path_cache.get_path(code)
        if cached_dir is None:
            # No cached path yet: create the directory and register it.
            new_dir = IOUtils.merge_dir(self.spider_dir, group_name)
            IOUtils.create_dir_force(new_dir)
            self.path_cache.add_cache(group_name)
            # Save the group's hyperlink alongside the new directory.
            self.save_url_link(new_dir, group_url)
            return new_dir
        return cached_dir
Example #3
0
def have_big_file(group_path):
    """Tell whether any entry of ``group_path`` is larger than 50,000 bytes.

    Args:
        group_path: Directory whose entries are listed and measured.

    Returns:
        True as soon as one entry's size exceeds the threshold, else False.
    """
    threshold = 50 * 1000
    # any() stops at the first oversized entry, like an early return would.
    return any(
        os.path.getsize(IOUtils.merge_dir(group_path, entry)) > threshold
        for entry in os.listdir(group_path)
    )
Example #4
0
 def group_done(self, group_code):
     """Mark an image group as finished by renaming its directory.

     The new name is the directory's current name prefixed with
     ``self.__done_sign__``, merged with ``self.spider_dir``.

     Args:
         group_code: Code used to look the group's path up in the path cache.

     Raises:
         RuntimeError: If the path cache returns no path for ``group_code``.
     """
     old_path = self.path_cache.get_path(group_code)
     if not old_path:
         raise RuntimeError('图组不存在')
     # basename() follows the platform's separator rules, so the directory
     # name is still extracted when the cached path contains backslashes.
     group_name = os.path.basename(old_path)
     new_path = IOUtils.merge_dir(self.spider_dir,
                                  self.__done_sign__ + group_name)
     os.rename(old_path, new_path)
     # NOTE(review): the path cache is not touched after the rename —
     # confirm whether a later get_path() is expected to see the new path.
     print('图组保存完成 >>> %s' % group_name)
Example #5
0
def have_good_file(group_path):
    """Tell whether ``group_path`` holds at least one acceptable image.

    The entry named ``target.url`` is ignored. Any other entry counts when
    its MD5 is not one of the two rejected digests and ``is_good_image``
    accepts it.

    Args:
        group_path: Directory whose entries are inspected.

    Returns:
        True on the first acceptable entry, False when none qualifies.
    """
    rejected_md5s = (
        '7e80fb31ec58b1ca2fb3548480e1b95e',
        '4cf24fe8401f7ab2eba2c6cb82dffb0e',
    )
    candidates = (
        IOUtils.merge_dir(group_path, entry)
        for entry in os.listdir(group_path)
        if entry != 'target.url'
    )
    # Check the MD5 first; image quality is only evaluated when it passes.
    return any(
        get_md5(candidate) not in rejected_md5s and is_good_image(candidate)
        for candidate in candidates
    )
Example #6
0
    def file_path(self, request, response=None, info=None):
        """Return the image's storage path relative to ``self.img_store``.

        Args:
            request: Request whose ``meta['item']`` provides
                ``image_group_url``, ``image_group_utils`` and
                ``image_group_name``, and whose ``url`` names the image.
            response: Unused here.
            info: Unused here.

        Returns:
            The group directory's path below ``self.img_store`` merged with
            the image name from ``self.get_image_name``.

        Raises:
            RuntimeError: If the group directory does not start with
                ``self.img_store`` followed by a path separator.
        """
        item = request.meta['item']
        image_group_url = item['image_group_url']
        image_group_utils = item['image_group_utils']
        group_name = item['image_group_name']

        # Create the group directory (not re-created if it already exists).
        group_dir = image_group_utils.create_group_dir(group_name, image_group_url)
        # Take the part of the path after the store root. The root is escaped
        # so that backslashes, dots or brackets in it are matched literally
        # instead of being read as regex syntax.
        match = re.match(re.escape(self.img_store) + r'[\\/](.+)$', group_dir)
        if match is None:
            raise RuntimeError('相对路径获取失败')
        relative_dir = match.group(1)
        # Image file name.
        image_name = self.get_image_name(request.url)
        # Image location below the store root.
        return IOUtils.merge_dir(relative_dir, image_name)
Example #7
0
def runner():
    """Walk every group under ``path`` and remove the ones that fail a check.

    A group is removed for the first check it fails, in this order: its name
    is an error category, it has no big file, it has no good file. A progress
    line is rewritten in place after each group.
    """
    group_names = os.listdir(path)
    total = len(group_names)
    for done, group_name in enumerate(group_names, start=1):
        group_path = IOUtils.merge_dir(path, group_name)

        # Pick the removal reason, if any; later checks run only when the
        # earlier ones pass.
        if is_error_category(group_name):
            reason = '无用分类'
        elif not have_big_file(group_path):
            reason = '小文件'
        elif not have_good_file(group_path):
            reason = '低质量'
        else:
            reason = None
        if reason is not None:
            remove(reason, group_path)

        progress = '\r[ %s / %s ] %s >> ' % (done, total, group_name)
        print(progress, end='', flush=True)
    print('运行结束')
Example #8
0
 def remove_group_path(self, group_code):
     """Pass the path cached for ``group_code`` to ``IOUtils.remove_dir``.

     Args:
         group_code: Code used to look the group's path up in the path cache.
     """
     target_dir = self.path_cache.get_path(group_code)
     IOUtils.remove_dir(target_dir)
Example #9
0
 def __get_path_from_disk__(self, group_code):
     """Find the first entry of ``self.spider_path`` tagged with the code.

     Args:
         group_code: Code searched for, wrapped in parentheses, inside each
             entry name.

     Returns:
         The entry merged with ``self.spider_path``, or None when no entry
         name contains ``(<group_code>)``.
     """
     marker = '(%s)' % group_code
     tagged = (
         entry for entry in os.listdir(self.spider_path) if marker in entry
     )
     first_hit = next(tagged, None)
     if first_hit is None:
         return None
     return IOUtils.merge_dir(self.spider_path, first_hit)
Example #10
0
 def add_cache(self, group_name):
     """Store the group's path in ``self.cache_list`` under its code.

     Args:
         group_name: Name of the group; the key is the first element of
             ``ImageGroupUtils.get_group_code_and_title(group_name)`` and
             the value is the name merged with ``self.spider_path``.
     """
     code_and_title = ImageGroupUtils.get_group_code_and_title(group_name)
     cache_key = code_and_title[0]
     cached_path = IOUtils.merge_dir(self.spider_path, group_name)
     self.cache_list[cache_key] = cached_path