Exemple #1
0
def is_special_illust_ids(illust_path: str = None, **kwargs) -> bool:
    """
    Check whether the illustration at ``illust_path`` belongs to a given user.

    The user's illustration ids are read from a per-user JSON cache file
    located relative to this module. When the cache file does not exist the
    ids are queried from the database (ordered by total bookmarks,
    descending) and written to the cache file.

    :param illust_path: path or file name handed to ``get_illust_id``
    :param kwargs: ``user_id`` selects the user whose ids are loaded;
                   ``illust_id`` only takes part in the emptiness check
    :return: True when the id parsed from ``illust_path`` is among the
             user's illustration ids, False otherwise (also when neither
             ``user_id`` nor ``illust_id`` is given)
    """
    user_id = kwargs.get('user_id')
    if not user_id and not kwargs.get('illust_id'):
        log.error('The user_id or illust_id is empty.')
        return False
    # NOTE(review): when only illust_id is supplied, user_id is None here and
    # the lookup below runs for str(None) -- confirm whether the user should
    # be resolved from illust_id instead.
    cache_illust_ids_path = os.path.join(
        os.path.dirname(__file__),
        r'.\cache\\' + str(user_id) + '-illust-ids.json')
    if os.path.isfile(cache_illust_ids_path):
        # Context manager closes the cache file deterministically.
        with open(cache_illust_ids_path, 'r', encoding='utf-8') as cache_file:
            illust_ids = json.load(cache_file)
    else:
        # All illustration ids of this user, most bookmarked first.
        user_illusts = session.query(Illustration)\
            .filter(Illustration.user_id == user_id)\
            .order_by(Illustration.total_bookmarks.desc()).all()
        illust_ids = [x.id for x in user_illusts]
        log.info('query user_id: {}, illust_ids_size: {}'.format(
            user_id, len(illust_ids)))
        with open(cache_illust_ids_path, 'w', encoding='utf-8') as cache_file:
            json.dump(illust_ids, cache_file, ensure_ascii=False, indent=4)
    current_illust_id = get_illust_id(illust_path)
    return current_illust_id in illust_ids
Exemple #2
0
def update_dir_illust_tag(directory: str, tag: str):
    """
    Tag the illust database record of every file found directly in a directory.

    Sub-directories are skipped; files whose parsed illust id is not
    positive are logged and skipped.

    :param directory: target directory
    :param tag: name of the tag to apply, for example
               ignore: reviewed and not wanted
               downloaded: already downloaded
               small: image is too small
               delete: delete directly
               too_long: too long, usually a comic strip
               gray: black-and-white illustration
    :return: None
    """
    if not os.path.exists(directory):
        log.error('The directory is not exist: {}'.format(directory))
        return

    entries = os.listdir(directory)
    for entry in entries:
        # Only plain files carry an illust id; leave sub-directories alone.
        entry_path = os.path.join(directory, entry)
        if os.path.isdir(entry_path):
            continue

        log.info('process file: ' + entry)
        # Parse the illust id out of the file name.
        parsed_id = get_illust_id(entry)
        if parsed_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(entry))
        else:
            update_illustration_tag(parsed_id, tag)

    # The reported total counts every directory entry, including skipped ones.
    log.info('process end. total illust size: {}'.format(len(entries)))
def get_directory_illusts(illust_directory) -> list:
    """
    Collect the illustrations stored directly in a directory (pixiv naming).

    :param illust_directory: directory that holds the illustration files
    :return: list of dicts with the keys ``illust_id`` and ``path``
             (absolute path); empty when the directory does not exist
    """
    if not os.path.isdir(illust_directory):
        log.error(
            'The illust directory is not exist: {}'.format(illust_directory))
        return []

    collected = []
    for name in os.listdir(illust_directory):
        candidate = os.path.join(illust_directory, name)

        # Sub-directories are reported and skipped.
        if os.path.isdir(candidate):
            log.info('The file is directory: {}'.format(candidate))
            continue

        parsed_id = get_illust_id(candidate)
        if parsed_id is None:
            log.info('The file illust_id is None: {}'.format(candidate))
            continue

        record = {'illust_id': parsed_id}
        record['path'] = os.path.abspath(candidate)
        collected.append(record)

    log.info('read all illusts success. size: {}'.format(len(collected)))
    return collected
Exemple #4
0
def download_task_by_user_id(user_id=None,
                             illust_id=None,
                             save_dir=None,
                             check_download=True,
                             **kwargs):
    """
    Resolve a user id and download that user's illustrations.

    The user id is taken, in order of precedence, from the owner of
    ``illust_id`` (when that record exists), from the ``user_id`` argument,
    or from the id parsed out of ``save_dir``.

    :param user_id: user whose illustrations should be downloaded
    :param illust_id: an illustration id used to look up its owner
    :param save_dir: download directory; defaults to a per-user directory
                     under ``.\\result\\by-user``
    :param check_download: when true, skip users that ``is_download_user``
                           reports as already downloaded; pass False for a
                           supplementary download
    :param kwargs: forwarded to ``download_by_user_id``
    :return: None
    """
    # Look up the owning user through the illustration id, when one is given.
    if illust_id is not None:
        owner_record: Illustration = session.query(Illustration).get(illust_id)
        if owner_record is not None:
            user_id = owner_record.user_id

    # A given directory usually means topping up that user's illustrations,
    # so try to parse the user id from the directory name.
    if user_id is None and save_dir is not None:
        dir_user_id = get_illust_id(save_dir)
        if dir_user_id >= 0:
            user_id = dir_user_id

    if user_id is None:
        log.error('The user_id is not valid.')
        return

    already_done = check_download and is_download_user(user_id)
    if already_done:
        log.warn('The user hase been download. user_id: {}'.format(user_id))
        return

    # No directory given: fall back to a per-user directory.
    target_dir = save_dir
    if target_dir is None:
        target_dir = os.path.join(r'.\result\by-user', str(user_id))

    download_by_user_id(target_dir,
                        user_id,
                        skip_download=False,
                        skip_max_page_count=10,
                        split_r_18=False,
                        **kwargs)
Exemple #5
0
def check_user_id(source_dir: str,
                  user_dir: str,
                  user_id=None,
                  keep_source=True,
                  use_cache=True,
                  replace_user_file=False):
    """
    Find the images of one user in ``source_dir`` and move them to ``user_dir``.

    :param source_dir: directory whose images are examined
    :param user_dir: the user's own illustration directory, target of the move
    :param user_id: user id to match; when None, the id parsed from
                    ``user_dir`` is used if it is not negative
    :param keep_source: when the target file already exists, leave the
                        source file untouched and skip it
    :param use_cache: passed to ``get_image_meta_infos`` to allow cached data
    :param replace_user_file: files are only moved (via ``os.replace``) when
                              this is true; otherwise hits are just counted
    :return: None
    """
    if not os.path.isdir(user_dir):
        log.error(
            'The user directory is not exist. directory: {}'.format(user_dir))
        return None

    dir_user_id = get_illust_id(user_dir)
    if user_id is None and dir_user_id >= 0:
        user_id = dir_user_id

    meta_infos = get_image_meta_infos(source_dir, use_cache)
    log.info('total image file size: {}'.format(len(meta_infos)))

    hit_count = 0
    for meta in meta_infos:
        if meta.get('user_id') != user_id:
            continue

        source_path = meta.get('path')
        if not os.path.isfile(source_path):
            log.info('The file was delete. path: {}'.format(source_path))
            continue

        log.info('The illust({}) is belong user_id({}).'.format(
            meta.get('illust_id'), user_id))
        target_path = os.path.join(user_dir, meta.get('file_name'))

        target_exists = os.path.isfile(target_path)
        if target_exists:
            log.warn('The target user illust is exist: {}, keep: {}'.format(
                target_path, keep_source))
        if target_exists and keep_source:
            continue

        # Counted as a hit even when the move itself is disabled.
        hit_count += 1
        if not replace_user_file:
            continue
        log.info('begin move file from: {} to : {}'.format(
            source_path, target_path))
        os.replace(source_path, target_path)

    log.info('end check user_id, hit file size: {}, dir: {}'.format(
        hit_count, user_dir))
Exemple #6
0
def extract_top(illust_path: str, count: int):
    """
    Move the ``count`` most-bookmarked illustrations into a ``top`` sub-directory.

    Every regular file directly inside ``illust_path`` is mapped to its
    database record; the records are sorted by total bookmarks (descending)
    and the files belonging to the first ``count`` records are moved to
    ``<illust_path>/top`` (created when missing).

    :param illust_path: directory that holds the illustration files
    :param count: number of records taken from the sorted list
    :return: None
    """
    if not os.path.isdir(illust_path):
        log.error('The illust path is not exist: {}'.format(illust_path))
        return
    illust_files = os.listdir(illust_path)
    log.info('The illust size is: {}'.format(len(illust_files)))

    # The "top" sub-directory.
    top_directory = os.path.join(illust_path, 'top')
    if not os.path.isdir(top_directory):
        log.info('create top directory: {}'.format(top_directory))
        os.makedirs(top_directory)

    # Look up the illustration record of every regular file.
    illustrations = []
    file_illust_ids = []  # (file name, illust id) of each usable file
    for illust_file in illust_files:
        # os.listdir returns bare names, so resolve them against illust_path
        # (a bare name would be tested against the working directory).
        if os.path.isdir(os.path.join(illust_path, illust_file)):
            log.info('The file is directory: {}'.format(illust_file))
            continue
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.error('The illust_id is is not exist: {}'.format(illust_file))
            continue
        illustration = session.query(Illustration).get(illust_id)
        if illustration is None:
            # A missing record would break the sort below.
            log.warn('The illustration is not exist. illust_id: {}'.format(
                illust_id))
            continue
        illustrations.append(illustration)
        file_illust_ids.append((illust_file, illust_id))

    # Sort by bookmarks, descending, and keep the first `count` records.
    illustrations.sort(key=lambda x: x.total_bookmarks, reverse=True)
    illustrations = illustrations[:count]
    top_illust_ids = set(x.id for x in illustrations)
    log.info('The top illust ids is: {}'.format(top_illust_ids))

    # Move the top-bookmarked files into the "top" directory. Only the
    # regular files collected above are considered, never sub-directories.
    for illust_file, illust_id in file_illust_ids:
        if illust_id not in top_illust_ids:
            continue
        log.info('ready move top file: {}'.format(illust_file))
        source_file_path = os.path.abspath(
            os.path.join(illust_path, illust_file))
        move_target_path = os.path.abspath(
            os.path.join(top_directory, illust_file))
        log.info('move file: {} --> {}'.format(source_file_path, move_target_path))
        os.replace(source_file_path, move_target_path)
Exemple #7
0
def get_image_meta_infos(target_directory: str, use_cache=True) -> list:
    """
    Build the meta information of the images found for ``target_directory``.

    The result is stored as JSON in a cache file located relative to this
    module; with ``use_cache`` true an existing cache file is returned
    as-is. Images without a valid illust id, images that no longer exist
    on disk and images without a database record are logged and skipped.

    :param target_directory: directory whose images are described
    :param use_cache: return the cached result when present; also passed
                      on to ``get_all_image_paths``
    :return: list of dicts with the keys ``width``, ``height``, ``path``,
             ``file_name``, ``illust_id``, ``user_id``, ``size``, ``r_18``,
             ``bookmarks`` and ``tag``
    """
    cache_file_path = get_cache_path(target_directory, 'meta-info', 'json')
    cache_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                   cache_file_path)
    if use_cache and os.path.isfile(cache_file_path):
        # Context manager closes the cache file deterministically.
        with open(cache_file_path, 'r', encoding='utf-8') as cache_file:
            return json.load(cache_file)
    image_meta_infos = []

    image_paths = get_all_image_paths(target_directory, use_cache)
    log.info('total image file size: {}'.format(len(image_paths)))
    for image_path in image_paths:
        illust_id = get_illust_id(image_path)

        if illust_id < 0:
            log.warn(
                'The illust is not format. image_path: {}'.format(image_path))
            continue

        # The path list may be stale, so re-check that the file still exists.
        if not os.path.isfile(image_path):
            log.warn(
                'The illust was deleted. image_path: {}'.format(image_path))
            continue

        illustration: Illustration = session.query(Illustration).get(illust_id)
        if illustration is None:
            log.warn('The illustration is not exist. illust_id: {}'.format(
                illust_id))
            continue

        image_meta_infos.append({
            'width': illustration.width,
            'height': illustration.height,
            'path': image_path,
            'file_name': os.path.split(image_path)[1],
            'illust_id': illust_id,
            'user_id': illustration.user_id,
            'size': os.path.getsize(image_path),
            'r_18': illustration.r_18,
            'bookmarks': illustration.total_bookmarks,
            'tag': illustration.tag
        })
    log.info('get_image_meta_infos end. image size: {}'.format(
        len(image_meta_infos)))
    with open(cache_file_path, 'w', encoding='utf-8') as cache_file:
        json.dump(image_meta_infos, cache_file, ensure_ascii=False, indent=4)
    return image_meta_infos
Exemple #8
0
def download_by_user_id(save_directory,
                        user_id: int,
                        min_total_bookmarks=5000,
                        **kwargs):
    """
    Download a user's illustrations that reach a bookmark threshold.

    Illustrations are processed from most to least bookmarked. Files that
    are already present in ``save_directory`` are not downloaded again.
    After the loop the user is tagged ``download``.

    :param save_directory: download directory, created when missing
    :param user_id: user whose illustrations are downloaded
    :param min_total_bookmarks: minimum total bookmarks an illustration
                                needs to be selected
    :param kwargs: forwarded to ``download_by_illustration_id``
    :return: None
    """
    log.info('begin download illust by user_id: {}'.format(user_id))
    illustrations = session.query(Illustration)\
        .filter(Illustration.user_id == user_id)\
        .filter(Illustration.total_bookmarks >= min_total_bookmarks)\
        .order_by(Illustration.total_bookmarks.desc()).all()
    if not illustrations:
        log.warn('The illustrations is empty. user_id: {}'.format(user_id))
        return

    if not os.path.isdir(save_directory):
        os.makedirs(save_directory)

    # Collect the ids already present in the directory so they are skipped.
    # A set keeps the membership test in the download loop O(1).
    download_illust_ids = set()
    for illust_file in os.listdir(save_directory):
        file_path = os.path.join(save_directory, illust_file)
        if os.path.isdir(file_path):
            continue

        # Files of at most 100 bytes are not counted as downloaded
        # (presumably failed or placeholder downloads -- TODO confirm).
        if os.path.getsize(file_path) <= 100:
            continue

        # Parse the illust id out of the file name.
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(
                illust_file))
            continue
        download_illust_ids.add(illust_id)

    log.info('The illustrations size is: {}'.format(len(illustrations)))
    for illustration in illustrations:
        if illustration.id in download_illust_ids:
            log.info('The illus was downloaded. illust_id: {}'.format(
                illustration.id))
            continue

        download_by_illustration_id(save_directory, illustration.id, **kwargs)
    update_user_tag(user_id, 'download')
    log.info('end download illust by user_id: {}'.format(user_id))
Exemple #9
0
def update_dir_user_tag(source_dir, tag, replace=True):
    """
    Update the tag of every user that has a sub-directory in ``source_dir``.

    Each sub-directory name is parsed into a user id; entries that are not
    directories, or whose parsed id is not positive, are skipped.

    :param source_dir: directory whose sub-directories are processed
    :param tag: tag to apply, e.g. download, favorite
    :param replace: whether to replace the existing tag; forwarded to
                    ``update_user_tag``
    :return: None
    """
    if not os.path.exists(source_dir):
        log.error('The directory is not exist: {}'.format(source_dir))
        return
    for path in os.listdir(source_dir):
        # Users are always represented by directories.
        if not os.path.isdir(os.path.join(source_dir, path)):
            continue
        user_id = get_illust_id(path)
        if user_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(path))
            continue
        # Honour the caller's choice instead of a hard-coded True.
        update_user_tag(user_id, tag, replace=replace)
def train_main_colors(illust_directory):
    """
    Compute the main colors of every illustration file in a directory.

    Results are kept in the JSON cache file ``.\\cache\\main_color.txt``,
    keyed by illust id. Illustrations already present in the cache are
    skipped, and the cache is rewritten after each newly processed file.

    :param illust_directory: directory that holds the illustration files
    :return: dict mapping the illust id (as a string, matching the keys
             loaded from the JSON cache) to a list of color entries sorted
             by ``count`` descending; each entry has the keys
             ``illust_id``, ``index``, ``count`` and ``color``
    """
    log.info('begin train main colors.')
    save_cache_file = r'.\cache\main_color.txt'
    illust_main_colors = {}
    if os.path.isfile(save_cache_file):
        with open(save_cache_file, 'r', encoding='utf-8') as cache_file:
            illust_main_colors = json.load(cache_file)
    for illust_name in os.listdir(illust_directory):
        illust_file = os.path.join(illust_directory, illust_name)
        if os.path.isdir(illust_file):
            log.info('The file is directory: {}'.format(illust_file))
            continue
        illust_id = get_illust_id(illust_file)
        if illust_id is None:
            log.info('The file illust_id is None: {}'.format(illust_file))
            continue
        # JSON object keys are always strings, so use the string form for
        # both the lookup and the store. Storing the int key would let a
        # second file with the same id be trained again in the same run.
        cache_key = str(illust_id)
        if cache_key in illust_main_colors:
            log.info('The file has been trained: {}'.format(illust_file))
            continue
        clusters, label_count = rgb_kmeans(illust_file)
        main_colors = [{
            'illust_id': illust_id,
            'index': int(label),
            'count': label_count[label],
            'color': clusters[label].tolist()
        } for label in label_count]
        main_colors.sort(key=lambda x: x['count'], reverse=True)
        illust_main_colors[cache_key] = main_colors
        # Rewrite the cache after every file, as the original did
        # (presumably so progress survives an interruption -- TODO confirm).
        with open(save_cache_file, 'w', encoding='utf-8') as cache_file:
            json.dump(illust_main_colors,
                      cache_file,
                      ensure_ascii=False,
                      indent=4)
    log.info('end train main colors.')
    return illust_main_colors