Example #1
def classify_main_color(illust_directory):
    log.info('begin classify main colors.')
    train_result_file = r'.\cache\main_color.txt'
    collect_directory = r'..\crawler\result\illusts\30000-40000\white'
    if not os.path.isdir(collect_directory):
        os.makedirs(collect_directory)

    if not os.path.isfile(train_result_file):
        log.error(
            'The train result file does not exist: {}'.format(train_result_file))
        return
    illust_main_colors = json.load(
        open(train_result_file, 'r', encoding='utf-8'))
    log.info('read train info finish.')
    for illust_id in illust_main_colors:
        main_colors = illust_main_colors[illust_id]
        main_colors.sort(key=lambda x: x['count'], reverse=True)

    illust_files = get_directory_illusts(illust_directory)
    for illust_file in illust_files:
        illust_id = illust_file['illust_id']
        if str(illust_id) not in illust_main_colors:
            log.warn(
                'The illust has no main colors info. illust_id: {}'.format(
                    illust_id))
            continue
        main_colors = illust_main_colors[str(illust_id)]
        if min(main_colors[0]['color']) > 220 and min(
                main_colors[1]['color']) > 200 and min(
                    main_colors[2]['color']) > 200:
            # the main color is white
            log.info('white illust. collect: {}'.format(illust_id))
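For context, classify_main_color assumes main_color.txt maps each illust id to its dominant-color entries. A minimal sketch of that assumed layout (not confirmed by the source):

# Assumed structure of the train result file; each entry is a
# {'color': [R, G, B], 'count': pixel_count} dict, and the code above
# sorts them so main_colors[0] is the most frequent color.
example_main_colors = {
    "83737284": [
        {"color": [250, 248, 246], "count": 104230},
        {"color": [231, 228, 225], "count": 20311},
        {"color": [204, 196, 190], "count": 9876}
    ]
}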
Example #2
def update_dir_illust_tag(directory: str, tag: str):
    """
    将某个文件夹下的所有文件在illust数据库中的记录标记tag
    :param directory: 目标文件夹
    :param tag: 某个类型的标记名称,
               ignore: 校验过不需要的插画
               downloaded: 已经下载的图片
               small: 图片太小
               delete: 直接删除
               too_long: 太长啦,一帮是那种漫画
               gray: 黑白插画
    :return: None
    """
    if not os.path.exists(directory):
        log.error('The directory does not exist: {}'.format(directory))
        return
    illust_files = os.listdir(directory)
    for illust_file in illust_files:
        # skip sub-directories; only process files
        if os.path.isdir(os.path.join(directory, illust_file)):
            continue
        log.info('process file: ' + illust_file)
        # extract illust_id
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.warn('The illust_id could not be parsed from file: {}'.format(illust_file))
            continue
        update_illustration_tag(illust_id, tag)
        # os.remove(os.path.join(directory, illust_file))
    log.info('process end. total illust size: {}'.format(len(illust_files)))
Example #3
def collect_sub_files(source_root_directory, move_target_directory):
    """
    遍历所有子文件,然后移动到另一个地方,避免有些下载的文件嵌套太深
    可以批量把某个文件夹下的所有文件移动到指定的目录下
    :param source_root_directory: 检查的路径
    :param move_target_directory: 统一移动到的路径
    :return:
    """
    if not os.path.isdir(move_target_directory):
        # create the target directory if it does not exist
        os.makedirs(move_target_directory)
    sub_file_paths = u_file.get_all_sub_files(source_root_directory)

    for sub_file_path in sub_file_paths:
        if os.path.isdir(sub_file_path):
            log.info('The file is directory: {}'.format(sub_file_path))
            continue
        sub_file_name = os.path.split(sub_file_path)[1]
        sub_file_name_suffix = os.path.splitext(sub_file_name)[1]
        if sub_file_name_suffix != '.jpg' and sub_file_name_suffix != '.hdr':
            log.info('The file is not a jpg or hdr file: {}'.format(sub_file_name))
            continue

        move_target_file_path = os.path.join(move_target_directory,
                                             sub_file_name)
        if os.path.isfile(move_target_file_path):
            log.warn('The move target file already exists: {}'.format(
                move_target_file_path))
            continue

        log.info('move file: {} --> file: {}'.format(sub_file_path,
                                                     move_target_file_path))
        os.replace(sub_file_path, move_target_file_path)
Example #4
def get_json(url, params=None, headers=None, **kwargs) -> dict:
    """
    request json from url
    :param url: url
    :param params: params
    :param headers: headers
    :return: json
    """
    default_headers = {}
    default_headers.update(COMMON_HEADERS)
    if headers is not None:
        default_headers.update(headers)
    # retry once on a transient request error
    try:
        response = requests.get(url,
                                params=params,
                                headers=default_headers,
                                verify=False,
                                **kwargs)
    except Exception as e:
        log.warn('request error and try again. {}'.format(e))
        response = requests.get(url,
                                params=params,
                                headers=default_headers,
                                verify=False,
                                **kwargs)
    return json.loads(response.text)
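A hypothetical call; the endpoint and parameters below are placeholders, not a real API:

# Hypothetical usage; extra kwargs such as timeout pass through to requests.get.
data = get_json('https://example.com/api/posts',
                params={'page': 1, 'limit': 100},
                headers={'Referer': 'https://example.com'},
                timeout=10)
print(data)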
Example #5
def download_task_by_user_id(user_id=None,
                             illust_id=None,
                             save_dir=None,
                             check_download=True,
                             **kwargs):
    # look up the user id for the given illustration id
    if illust_id is not None:
        illust: Illustration = session.query(Illustration).get(illust_id)
        if illust is not None:
            user_id = illust.user_id

    # if a save directory is given (usually to top up a user's illusts), try to parse user_id from it
    if user_id is None and save_dir is not None:
        parse_user_id = get_illust_id(save_dir)
        if parse_user_id >= 0:
            user_id = parse_user_id

    if user_id is None:
        log.error('The user_id is not valid.')
        return

    # if check_download is true, users already downloaded are skipped;
    # set it to false for supplemental downloads
    if check_download and is_download_user(user_id):
        log.warn('The user has already been downloaded. user_id: {}'.format(user_id))
        return

    if save_dir is None:
        # no user directory given, create a new one
        save_dir = os.path.join(r'.\result\by-user', str(user_id))
    download_by_user_id(save_dir,
                        user_id,
                        skip_download=False,
                        skip_max_page_count=10,
                        split_r_18=False,
                        **kwargs)
Example #6
def get_track_info(track_id) -> dict:
    track_param = {
        'device': 'android',
        'trackId': track_id
    }
    response = requests.get(URL['track_info'], params=track_param, headers=HEADERS)
    u_log.info('get track info success. trackId: {}'.format(track_id))
    track_info: dict = json.loads(response.text)
    if track_info.get('ret') != 0 or 'trackInfo' not in track_info:
        u_log.warn('The response does not contain trackInfo. {}'.format(response.text))
        return {}
    track_info = track_info.get('trackInfo')

    intro_param = {
        'ac': 'WIFI',
        'device': 'android',
        'supportWebp': 'true',
        'trackId': track_id,
        'trackUid': 29200911
    }
    response = requests.get(URL['track_intro'], params=intro_param, headers=HEADERS)
    u_log.info('get track rich intro info success. trackId: {}'.format(track_id))
    track_intro_info = json.loads(response.text)
    if track_intro_info.get('ret') != 0 or 'richIntro' not in track_intro_info:
        u_log.warn('The response does not contain richIntro. {}'.format(response.text))
        return track_info
    track_info['richIntro'] = track_intro_info.get('richIntro')
    u_log.info('get all track info success. trackId: {}'.format(track_id))
    return track_info
Example #7
def check_user_id(source_dir: str,
                  user_dir: str,
                  user_id=None,
                  keep_source=True,
                  use_cache=True,
                  replace_user_file=False):
    """
    检查和移动某个用户下的图片到目标文件夹
    :param user_id: 指定用户id
    :param source_dir: 需要处理的文件夹
    :param user_dir: 某个用户专有的插画集文件夹,移动文件的目标文件夹
    :param keep_source: 是否保留原来的文件,如果存在重复的时候生效
    :param use_cache: 是否使用缓存中的文件目录
    :param replace_user_file: 是否替换掉用户文件夹中的文件
    :return:
    """
    if not os.path.isdir(user_dir):
        log.error(
            'The user directory does not exist. directory: {}'.format(user_dir))
        return None

    parse_user_id = get_illust_id(user_dir)
    if user_id is None and parse_user_id >= 0:
        user_id = parse_user_id

    image_meta_infos = get_image_meta_infos(source_dir, use_cache)
    log.info('total image file size: {}'.format(len(image_meta_infos)))

    index = 0
    move_file_size = 0
    for image_meta_info in image_meta_infos:
        index += 1
        # if index % 1000 == 0:
        #     log.info('processed file size: {}'.format(index))
        if image_meta_info.get('user_id') != user_id:
            continue

        if not os.path.isfile(image_meta_info.get('path')):
            log.info('The file was deleted. path: {}'.format(
                image_meta_info.get('path')))
            continue

        log.info('The illust({}) belongs to user_id({}).'.format(
            image_meta_info.get('illust_id'), user_id))
        move_target_path = os.path.join(user_dir,
                                        image_meta_info.get('file_name'))
        if os.path.isfile(move_target_path):
            log.warn('The target user illust already exists: {}, keep: {}'.format(
                move_target_path, keep_source))
            if keep_source:
                continue

        move_file_size += 1
        if replace_user_file:
            log.info('begin move file from: {} to : {}'.format(
                image_meta_info.get('path'), move_target_path))
            os.replace(image_meta_info.get('path'), move_target_path)
    log.info('end check user_id, hit file size: {}, dir: {}'.format(
        move_file_size, user_dir))
Example #8
def compare_similarity():
    log.info('begin')
    directory = r'H:\Pictures\动漫插画\东方Project\爱丽丝·玛格特罗依德\small'
    sim_directory = os.path.join(directory, 'sim')
    if not os.path.isdir(sim_directory):
        os.makedirs(sim_directory)
    image_paths = get_all_image_paths(directory, use_cache=False)
    dimension = 200
    log.info('all image size: {}'.format(len(image_paths)))
    similarities = []
    i = 0
    while i < len(image_paths):
        # check file
        if not os.path.isfile(image_paths[i]):
            log.warn('The file does not exist: {}'.format(image_paths[i]))
            i += 1
            continue
        log.info('source image path: {}'.format(image_paths[i]))
        j = i + 1
        while j < len(image_paths):
            # check file
            if not os.path.isfile(image_paths[j]):
                log.warn('The file does not exist: {}'.format(image_paths[j]))
                j += 1
                continue
            log.info('compare similarity: image1: {}, image2: {}'.format(
                image_paths[i], image_paths[j]))
            similarity, image1, image2 = similarity_hist(
                image_paths[i], image_paths[j], dimension)
            similarities.append({
                'source_path': image_paths[i],
                'target_path': image_paths[j],
                'similarity': similarity
            })
            log.info('similarity: {}'.format(similarity))
            if similarity >= 0.99:
                log.info('move file. similarity: {}, file: {}'.format(
                    similarity, image_paths[j]))
                os.replace(
                    image_paths[j],
                    os.path.join(
                        sim_directory,
                        str(i) + '-' + os.path.split(image_paths[j])[1]))
            j += 1
            # plt.subplot(121)
            # plt.imshow(image1)
            # plt.subplot(122)
            # plt.imshow(image2)
            # plt.show()
            # break
        # break
        i += 1
    similarities.sort(key=lambda x: x['similarity'], reverse=True)
    if similarities:
        log.info('the most similar pair: {}'.format(similarities[0]))
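compare_similarity relies on a similarity_hist helper that is not shown here. A minimal sketch of what such a histogram-based comparison might look like with OpenCV; this is an assumption, not the author's implementation:

import cv2

def similarity_hist_sketch(path1, path2, dimension):
    # resize both images to the same square and compare 3D color histograms;
    # cv2.HISTCMP_CORREL yields 1.0 for identical histograms
    image1 = cv2.resize(cv2.imread(path1), (dimension, dimension))
    image2 = cv2.resize(cv2.imread(path2), (dimension, dimension))
    hist1 = cv2.calcHist([image1], [0, 1, 2], None, [8, 8, 8], [0, 256] * 3)
    hist2 = cv2.calcHist([image2], [0, 1, 2], None, [8, 8, 8], [0, 256] * 3)
    similarity = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
    return similarity, image1, image2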
Example #9
def read_content(file_path):
    """
    read content from file, use UTF-8 encoding
    :param file_path: target file path
    :return: file content
    """
    if not os.path.isfile(file_path):
        log.warn('The file does not exist: {}'.format(file_path))
        return None
    log.info('read content from file: {}'.format(file_path))
    with open(file_path, 'r', encoding='UTF-8') as fin:
        content = fin.read()
    return content
Example #10
def get_all_page_book_list(template_url: str) -> list:
    max_page_size = 100
    book_infos = []
    for index in range(1, max_page_size):
        url = template_url.format(index)
        page_book_infos = get_book_list(url)
        if len(page_book_infos) == 0:
            log.warn('The book info list is empty. end crawler.')
            break
        book_infos.extend(page_book_infos)
        log.info('end crawler url: {}, book size: {}'.format(
            url, len(page_book_infos)))
        # checkpoint the accumulated results after every page
        u_file.cache_json(book_infos, r'result/total_book_info.json')
    return book_infos
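A hypothetical call; the template URL is a placeholder with a {} slot for the page index:

# Hypothetical usage; not a real endpoint.
books = get_all_page_book_list('https://example.com/books?page={}')
print('total books: {}'.format(len(books)))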
Example #11
def get_post_info(page) -> list:
    params = {
        'page': page,
        'limit': 100,
        # 'tags': 'chintora0201'
    }
    posts = u_file.get_json(CRAWL_URLS.get('post'), params)
    if not isinstance(posts, list):
        u_log.warn("The response is not post list.")
        return []
    u_log.info('post size: {}'.format(len(posts)))
    for post in posts:
        save_post(post)
        u_log.info('save post success. post_id: {}'.format(post.get('id')))
    return posts
Example #12
def get_illust_id(illust_file_path: str) -> int:
    """
    通过文件名,提取插画pixiv_id
    :param illust_file_path: 插画路径,可以使相对路径,绝对路径或者文件名
    :return: 插画id,如果没有则返回-1
    """
    illust_filename = os.path.split(illust_file_path)[1]
    illust_id = illust_filename.split('_')[0]
    if illust_id.isdigit():
        return int(illust_id)
    illust_id = illust_filename.split('-')[0]
    if illust_id.isdigit():
        return int(illust_id)
    log.warn('The illust_id could not be parsed. illust_file: {}'.format(illust_file_path))
    return -1
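A short usage sketch; the file names are made up, but follow the usual '<illust_id>_p<page>.<ext>' pixiv download pattern:

assert get_illust_id('83737284_p0.jpg') == 83737284
assert get_illust_id('83737284-cover.png') == 83737284
assert get_illust_id('not-an-id.png') == -1  # logs a warning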
Example #13
def rget(data, keys, default=None):
    """
    Recursively fetch a value from nested dicts by a list of keys.
    """
    # read the first key without pop(0), so the caller's list is not mutated
    key, rest = keys[0], keys[1:]
    try:
        elem = data[key]
    except KeyError:
        return default
    except TypeError:
        log.warn('The data is not a dict: {}'.format(data))
        return None
    if not rest:
        return elem
    return rget(elem, rest, default)
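A short usage sketch with a made-up nested dict:

track = {'data': {'tracks': {'totalCount': 42}}}
print(rget(track, ['data', 'tracks', 'totalCount']))      # 42
print(rget(track, ['data', 'missing', 'x'], default=0))   # 0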
Example #14
def download_by_user_id(save_directory,
                        user_id: int,
                        min_total_bookmarks=5000,
                        **kwargs):
    log.info('begin download illust by user_id: {}'.format(user_id))
    illustrations: [Illustration] = session.query(Illustration)\
        .filter(Illustration.user_id == user_id)\
        .filter(Illustration.total_bookmarks >= min_total_bookmarks)\
        .order_by(Illustration.total_bookmarks.desc()).all()
    if illustrations is None or len(illustrations) <= 0:
        log.warn('The illustration list is empty. user_id: {}'.format(user_id))
        return

    if not os.path.isdir(save_directory):
        os.makedirs(save_directory)

    # check the current directory; files already downloaded are skipped
    download_illust_ids = []
    illust_files = os.listdir(save_directory)
    for illust_file in illust_files:
        # skip sub-directories; only process files
        if os.path.isdir(os.path.join(save_directory, illust_file)):
            continue

        if os.path.getsize(os.path.join(save_directory, illust_file)) <= 100:
            continue

        # extract illust_id
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.warn('The illust_id could not be parsed from file: {}'.format(
                illust_file))
            continue
        download_illust_ids.append(illust_id)

    log.info('The illustrations size is: {}'.format(len(illustrations)))
    for illustration in illustrations:
        if illustration.id in download_illust_ids:
            log.info('The illust was already downloaded. illust_id: {}'.format(
                illustration.id))
            continue

        download_by_illustration_id(save_directory, illustration.id, **kwargs)
    update_user_tag(user_id, 'download')
    log.info('end download illust by user_id: {}'.format(user_id))
Example #15
def convert_image_format(image_path, delete=False):
    """
    转换WEBP的图片格式到JPEG
    :param image_path: 图片地址,最好是绝对路径
    :param delete: 是否删除原来的图片
    :return:
    """
    if not os.path.isfile(image_path):
        log.warn('The image is not exist. path: {}'.format(image_path))
        return None
    image = Image.open(image_path)
    image_format = image.format
    # 如果是webp格式转为jpeg格式
    if image_format == 'WEBP':
        image.save(image_path, 'JPEG')
    image.close()
    if delete:
        os.remove(image_path)
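A hypothetical call; the path is a placeholder:

# converts to 83737284_p0.jpg and removes the original .webp file
convert_image_format(r'D:\illusts\83737284_p0.webp', delete=True)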
Example #16
def read_file_as_list(file_path: str) -> list:
    """
    按行读取文件,并返回list,每一个元素是每一行记录
    :param file_path: 文件绝对地址
    :return:
    """
    if not os.path.isfile(file_path):
        log.warn('The file does not exist. {}'.format(file_path))
        return []
    contents = set()
    with open(file_path, 'r', encoding='utf-8') as file_handle:
        for line in file_handle:
            contents.add(line.strip('\n'))
    log.info('read file end. list size: {}'.format(len(contents)))
    return list(contents)
Example #17
def replace_file_name(source_root_directory, replace_ad_str):
    """
    一般用来去掉下载文件中的广告
    :param replace_ad_str: 需要替换掉的广告文字
    :param source_root_directory: 处理的文件夹
    :return:
    """
    sub_file_paths = u_file.get_all_sub_files(source_root_directory)

    for sub_file_path in sub_file_paths:
        # replace only in the file name, so directory components stay untouched
        sub_dir, sub_file_name = os.path.split(sub_file_path)
        move_target_file_path = os.path.join(
            sub_dir, sub_file_name.replace(replace_ad_str, ''))
        if os.path.isfile(move_target_file_path):
            log.warn(
                'The target file already exists: {}'.format(move_target_file_path))
            continue

        log.info('rename file: {} --> file: {}'.format(sub_file_path,
                                                       move_target_file_path))
        os.replace(sub_file_path, move_target_file_path)
Example #18
def extract_pose_urls(html_content) -> list:
    if not html_content:
        log.info('The html content is not valid.')
        return []
    soup = BeautifulSoup(html_content, 'lxml')
    content_node = soup.find(id='content')
    if not content_node:
        log.warn('The content node is not valid.')
        return []
    pose_img_nodes = content_node.select('div.block1 > ul.list > li > a > img')
    pose_urls = []
    for pose_img_node in pose_img_nodes:
        pose_url = pose_img_node['src']
        if pose_url and pose_url != '':
            pose_url = pose_url.replace('_thumb', '')
            pose_url = CONFIG.get('host') + pose_url
            pose_urls.append(pose_url)
    log.info('extract pose urls success. size: {}'.format(len(pose_urls)))
    return pose_urls
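A usage sketch with made-up HTML that mirrors the selector above; it assumes the module-level CONFIG has a 'host' entry:

html = '''
<div id="content"><div class="block1"><ul class="list">
  <li><a href="#"><img src="/poses/001_thumb.jpg"></a></li>
</ul></div></div>
'''
# prints [CONFIG host + '/poses/001.jpg']
print(extract_pose_urls(html))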
Example #19
def move_small_file(target_directory: str,
                    min_width=800,
                    min_height=800,
                    min_size=10000,
                    use_cache=True,
                    move_directory=None):
    # if no move directory is given, create a 'small' sub-directory under the target one
    if move_directory is None:
        move_directory = os.path.join(target_directory, 'small')
    if not os.path.isdir(move_directory):
        os.makedirs(move_directory)

    image_meta_infos = get_image_meta_infos(target_directory, use_cache)
    log.info('total image file size: {}'.format(len(image_meta_infos)))

    for image_meta_info in image_meta_infos:
        if not os.path.isfile(image_meta_info.get('path')):
            log.warn('The file was deleted. path: {}'.format(
                image_meta_info.get('path')))
            continue
        move_target_path = os.path.join(move_directory,
                                        image_meta_info.get('file_name'))
        if os.path.isfile(move_target_path):
            log.warn('The move file already exists: {}'.format(move_target_path))
            continue

        if image_meta_info.get('size') <= min_size:
            log.info('The file is small. size: ({}/{})'.format(
                image_meta_info.get('size'), min_size))
            log.info('begin move file from: {} to: {}'.format(
                image_meta_info.get('path'), move_target_path))
            os.replace(image_meta_info.get('path'), move_target_path)
            # the file has been moved; skip the dimension check below
            continue

        if image_meta_info.get('width') <= min_width and image_meta_info.get(
                'height') <= min_height:
            log.info(
                'The file is small, width: ({}/{}), height: ({}/{})'.format(
                    image_meta_info.get('width'), min_width,
                    image_meta_info.get('height'), min_height))
            log.info('begin move file from: {} to: {}'.format(
                image_meta_info.get('path'), move_target_path))
            os.replace(image_meta_info.get('path'), move_target_path)
    log.info('end move small file')
Example #20
def update_dir_user_tag(source_dir, tag, replace=True):
    """
    更新source_dir文件夹下的所有子文件夹中的user_id的标签
    :param source_dir: 需要处理的文件夹
    :param tag: 更新的标签,如download,favorite
    :param replace: 是否替换原来的标签
    :return: None
    """
    if not os.path.exists(source_dir):
        log.error('The directory does not exist: {}'.format(source_dir))
        return
    paths = os.listdir(source_dir)
    for path in paths:
        # users are directories; skip plain files
        if not os.path.isdir(os.path.join(source_dir, path)):
            continue
        user_id = get_illust_id(path)
        if user_id <= 0:
            log.warn('The user_id could not be parsed from directory: {}'.format(path))
            continue
        update_user_tag(user_id, tag, replace=replace)
Example #21
def parse_and_save_grammar_json(file_path: str):
    """
    讲语法讲解存入数据库中
    :param file_path:
    :return:
    """
    grammar_categories = u_file.load_json_from_file(file_path)
    if not grammar_categories or 'data' not in grammar_categories:
        log.warn('The grammar json is invalid: {}'.format(file_path))
        return

    grammar_categories = grammar_categories.get('data')
    log.info('load grammar json success. category size: {}'.format(len(grammar_categories)))
    for grammar_category in grammar_categories:
        log.info('parse grammar category: {}'.format(grammar_category.get('title')))
        if grammar_category.get('title') != grammar_category.get('label'):
            log.warn('The grammar title and label are not the same.')
        grammars = grammar_category.get('grammerList')
        log.info('parse grammars of category: {}, grammar size: {}'.format(
            grammar_category.get('title'), len(grammars)))
        for grammar in grammars:
            if grammar.get('explain') != grammar.get('comment') or grammar.get('type') != grammar.get('category') \
                    or grammar.get('category') != grammar_category.get('title'):
                log.warn('The grammar category is special. grammar: {}'.format(grammar.get('grammar')))
            log.info('get grammar: {}'.format(grammar.get('grammar')))
            db_grammar = Grammar(id=grammar.get('id'), content=grammar.get('content'))
            db_grammar.level = grammar.get('level')
            db_grammar.category = grammar.get('category')
            db_grammar.type = grammar.get('category')
            db_grammar.link = grammar.get('link')
            db_grammar.explain = grammar.get('explain')
            # 'exmple' (sic) matches the key used in the source JSON
            db_grammar.example = re.sub('[#@][0-9]*', '', grammar.get('exmple'))
            db_grammar.postscript = grammar.get('ps')
            save_grammar(db_grammar)
Example #22
def get_album_tracks(album_id) -> list:
    page_id = 1
    max_page = 2  # only the first page is fetched while the maxPageId update below stays commented out
    page_size = 130

    base_track_infos: list = []
    while page_id < max_page:
        track_info = get_album_track_info_page(album_id, page_id, page_size)
        if 'maxPageId' not in track_info:
            u_log.warn('The maxPageId is missing from the response.')
            break

        for track in track_info.get('list'):
            base_track_infos.append({
                'trackId': track.get('trackId'),
                'title': track.get('title'),
                'duration': track.get('duration')
            })
        # max_page = track_info.get('maxPageId')
        page_id += 1
    u_log.info('track size: {}'.format(len(base_track_infos)))
    return base_track_infos
Example #23
def save_post(post_info):
    if not post_info or 'id' not in post_info or not post_info.get('id'):
        u_log.warn('post_info format is invalid: {}'.format(post_info))
        return None

    if session.query(Post).filter(Post.id == post_info.get('id')).first() is not None:
        u_log.info("The post already exists. post_id: {}".format(post_info.get('id')))
        return None

    post = Post(id=post_info.get('id'), tags=(post_info.get('tags') or '')[:700])
    post.author = post_info.get('author')
    post.source = (post_info.get('source') or '')[:500]
    post.score = post_info.get('score')
    post.md5 = post_info.get('md5')
    post.file_size = post_info.get('file_size')
    post.sample_file_size = post_info.get('sample_file_size')
    post.jpeg_file_size = post_info.get('jpeg_file_size')
    post.file_ext = post_info.get('file_ext')
    post.file_url = post_info.get('file_url')
    post.preview_url = post_info.get('preview_url')
    post.sample_url = post_info.get('sample_url')
    post.jpeg_url = post_info.get('jpeg_url')
    post.preview_width = post_info.get('preview_width')
    post.preview_height = post_info.get('preview_height')
    post.actual_preview_width = post_info.get('actual_preview_width')
    post.actual_preview_height = post_info.get('actual_preview_height')
    post.sample_width = post_info.get('sample_width')
    post.sample_height = post_info.get('sample_height')
    post.jpeg_width = post_info.get('jpeg_width')
    post.jpeg_height = post_info.get('jpeg_height')
    post.width = post_info.get('width')
    post.height = post_info.get('height')
    post.status = post_info.get('status')
    post.rating = post_info.get('rating')
    post.parent_id = post_info.get('parent_id')
    post.has_children = post_info.get('has_children')
    session.merge(post)
    session.commit()
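The long run of field assignments could also be written as a loop; a sketch of that alternative, assuming the Post model exposes these attributes (source is excluded because it needs the length cap above):

SIMPLE_FIELDS = [
    'author', 'score', 'md5', 'file_size', 'sample_file_size',
    'jpeg_file_size', 'file_ext', 'file_url', 'preview_url', 'sample_url',
    'jpeg_url', 'preview_width', 'preview_height', 'actual_preview_width',
    'actual_preview_height', 'sample_width', 'sample_height', 'jpeg_width',
    'jpeg_height', 'width', 'height', 'status', 'rating', 'parent_id',
    'has_children',
]
for field in SIMPLE_FIELDS:
    # copy each simple field straight from the response dict
    setattr(post, field, post_info.get(field))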
Example #24
def get_album_track_info_page(album_id, page_id, page_size=20) -> dict:
    page_param = {
        'ac': 'WIFI',
        'albumId': album_id,
        'device': 'android',
        'isAsc': 'false',
        'isQueryInvitationBrand': 'true',
        'isVideoAsc': 'true',
        'pageId': page_id,
        'pageSize': page_size,
        'pre_page': '0',
        'source': '2',
        'supportWebp': 'true'
    }
    response = requests.get(URL['album'], params=page_param, headers=HEADERS)
    album_info: dict = json.loads(response.text)
    if album_info.get('ret') != 0 or 'data' not in album_info or 'tracks' not in album_info.get('data') \
            or 'list' not in album_info.get('data').get('tracks'):
        u_log.warn('The response does not contain tracks. {}'.format(response.text))
        return {}
    u_log.info('get track infos success, album_id: {}'.format(album_id))
    track_info = album_info.get('data').get('tracks')
    u_log.info('tracks total count: {}'.format(track_info.get('totalCount')))
    return track_info
Example #25
def get_image_meta_infos(target_directory: str, use_cache=True) -> list:
    cache_file_path = get_cache_path(target_directory, 'meta-info', 'json')
    cache_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                   cache_file_path)
    if use_cache and os.path.isfile(cache_file_path):
        return json.load(open(cache_file_path, 'r', encoding='utf-8'))
    image_meta_infos = []

    image_paths = get_all_image_paths(target_directory, use_cache)
    log.info('total image file size: {}'.format(len(image_paths)))
    index = 0
    for image_path in image_paths:
        index += 1
        illust_id = get_illust_id(image_path)
        # log.info('get illust_id: {} ({}/{})'.format(illust_id, index, len(image_paths)))

        if illust_id < 0:
            log.warn(
                'The illust_id could not be parsed. image_path: {}'.format(image_path))
            continue

        if not os.path.isfile(image_path):
            log.warn(
                'The illust was deleted. image_path: {}'.format(image_path))
            continue

        illustration: Illustration = session.query(Illustration).get(illust_id)
        if illustration is None:
            log.warn('The illustration does not exist. illust_id: {}'.format(
                illust_id))
            continue

        image_meta_infos.append({
            'width': illustration.width,
            'height': illustration.height,
            'path': image_path,
            'file_name': os.path.split(image_path)[1],
            'illust_id': illust_id,
            'user_id': illustration.user_id,
            'size': os.path.getsize(image_path),
            'r_18': illustration.r_18,
            'bookmarks': illustration.total_bookmarks,
            'tag': illustration.tag
        })
    log.info('get_image_meta_infos end. image size: {}'.format(
        len(image_meta_infos)))
    json.dump(image_meta_infos,
              open(cache_file_path, 'w', encoding='utf-8'),
              ensure_ascii=False,
              indent=4)
    return image_meta_infos
Example #26
def crawl_rank_illust_info():
    max_page_count = 10
    is_r18 = False
    date_offset_file = r'.\config\offset-r-18.json' if is_r18 else r'.\config\offset.json'
    date_offset_info = json.load(open(date_offset_file, encoding='utf-8'))
    log.info('init date_offset_info success. {}'.format(date_offset_info))

    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)
    query_date = datetime.datetime.strptime(date_offset_info.get('date'),
                                            '%Y-%m-%d').date()
    now = datetime.date.today()
    total_query_count = 0
    sleep_second = 10  # wait time after hitting the rate limit
    log.info('------------begin crawler-------------')
    while query_date < now:
        # query each day's ranking in turn
        page_index = 0
        next_url_options = {
            'mode': 'day_r18' if is_r18 else 'day',
            'date': query_date,
            'offset': date_offset_info.get('offset')
        }
        log.info('begin crawl date: {}, offset: {}'.format(
            query_date, date_offset_info.get('offset')))
        while page_index < max_page_count:
            # query at most max_page_count pages per day
            log.info('begin crawl illust info({}/{}). options: {}'.format(
                page_index, max_page_count, next_url_options))
            illusts = pixiv_api.illust_ranking(**next_url_options)
            log.info('end crawl illust info({}/{}). options: {}'.format(
                page_index, max_page_count, next_url_options))
            # illusts = json.load(open(r"../mysql/entity_example/rank-1.json", encoding='utf8'))
            if not illusts.get('illusts'):
                # empty result: either an error occurred, or this day has no more data
                log.warn('The response illusts is empty: {}'.format(illusts))
                if 'error' not in illusts:
                    # no error, so this day's illusts are fully crawled; move on to the next day
                    log.info(
                        'The response is not an error. The illusts of this day are fully crawled.'
                    )
                    break

                if illusts.get('error').get('message',
                                            '').find('Rate Limit') >= 0:
                    # rate limited: wait a moment and retry
                    log.warn('Touch Rate Limit. sleep {} second.'.format(
                        sleep_second))
                    time.sleep(sleep_second)
                if illusts.get('error').get('message', '').find('OAuth') >= 0:
                    # the access token expires after about an hour; refresh it and retry
                    log.warn("Access Token is invalid, refresh token.")
                    pixiv_api.auth()
                continue

            # extract the next crawl url and save the data
            log.info('extract next url: {}'.format(illusts.get('next_url')))
            next_url_options = pixiv_api.parse_next_url_options(
                illusts.get('next_url'))
            total_query_count += 1
            page_index += 1
            log.info("crawl success. illust size: {}, begin save info to db.".
                     format(len(illusts.get('illusts'))))
            for illust in illusts.get('illusts'):
                illust['r_18'] = is_r18
                save_illustration(illust)

            log.info(
                'crawl illust save database success. illust size: {}'.format(
                    len(illusts.get('illusts'))))
            # persist the crawl date and offset, so an interrupted run can resume
            date_offset_info['date'] = str(query_date)
            date_offset_info['offset'] = next_url_options['offset']
            json.dump(date_offset_info,
                      open(date_offset_file, 'w', encoding='utf-8'),
                      ensure_ascii=False,
                      indent=4)

        # move on to the next day
        query_date = query_date + datetime.timedelta(days=1)
        date_offset_info['offset'] = 0
    log.info('------------end crawler-------------')
    log.info('total query count: {}'.format(total_query_count))
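parse_next_url_options is not shown in this example. A minimal sketch of how such a helper might turn next_url's query string back into keyword options; this is an assumption based on how it is used above, not the pixivpy implementation:

from urllib.parse import urlparse, parse_qs

def parse_next_url_options_sketch(next_url: str) -> dict:
    # split the query string into a flat dict, e.g.
    # '...?mode=day&date=2021-01-01&offset=30'
    # -> {'mode': 'day', 'date': '2021-01-01', 'offset': '30'}
    query = parse_qs(urlparse(next_url).query)
    return {key: values[0] for key, values in query.items()}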
Example #27
def download_by_illustration_id(directory: str, illustration_id: int,
                                **kwargs):
    default_kwargs = {
        'spilt_bookmark': False,  # whether to split directories by bookmark count
        'split_r_18': True,  # whether to put r-18 files in a separate directory
        'skip_download': True,  # whether to skip illusts already marked downloaded
        'skip_min_width': 800,  # minimum width; narrower illusts are not downloaded
        'skip_min_height': 800,  # minimum height; shorter illusts are not downloaded
        'skip_max_page_count': 3,  # skip illusts with more pages than this
        'skip_ignore': True,  # do not download illusts already tagged ignore
    }
    default_kwargs.update(kwargs)
    kwargs = default_kwargs

    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)

    log.info(
        'begin download illust by illustration_id: {}'.format(illustration_id))
    illustration: Illustration = session.query(Illustration).get(
        illustration_id)
    if illustration is None:
        log.error(
            'The illustration(id: {}) does not exist.'.format(illustration_id))
        return
    illustration_images: [IllustrationImage] = session.query(IllustrationImage)\
        .filter(IllustrationImage.illust_id == illustration_id).all()
    if illustration_images is None or len(illustration_images) == 0:
        log.error('The illustration(id: {}) images do not exist.'.format(
            illustration_id))
        return

    # illusts with many pages are mostly comics; skip them for now
    if len(illustration_images) > kwargs.get('skip_max_page_count'):
        log.warn('The illustration(id: {}) has more than {} images.'.format(
            illustration_id, kwargs.get('skip_max_page_count')))
        return

    # filter by width and height
    if illustration.width < kwargs.get(
            'skip_min_width') or illustration.height < kwargs.get(
                'skip_min_height'):
        log.warn(
            'The illustration(id: {}) image is small, width: {}/{}, height: {}/{}'
            .format(illustration_id, illustration.width,
                    kwargs.get('skip_min_width'), illustration.height,
                    kwargs.get('skip_min_height')))
        return

    # illusts already tagged ignore (or small) are not downloaded
    if kwargs.get('skip_ignore') and illustration.tag in ('ignore', 'small'):
        log.warn('The illustration(id: {}) is ignore.'.format(illustration_id))
        return

    # split directories by bookmark count
    if kwargs.get('spilt_bookmark'):
        directory += '/' + '-'.join(
            str(i) for i in get_10_20(illustration.total_bookmarks))

    # put R-18 illusts in a sub-directory
    if kwargs.get('split_r_18') and illustration.r_18 == 1:
        directory += "/r-18"

    for illustration_image in illustration_images:
        if illustration_image.image_url_origin is None or illustration_image.image_url_origin == '':
            log.info(
                'The illustration_image(id: {}) image_url_origin is none.'.
                format(illustration_id))
            continue
        if kwargs.get('skip_download'
                      ) and illustration_image.process == 'DOWNLOADED':
            log.info(
                'The illustration_image(id: {}) has been downloaded.'.format(
                    illustration_id))
            continue
        log.info('begin process illust_id: {}, image_url: {}'.format(
            illustration_image.illust_id, illustration_image.image_url_origin))
        download_task(pixiv_api,
                      directory,
                      illustration_image=illustration_image)
        illustration_image.process = 'DOWNLOADED'
        session.merge(illustration_image)
        session.commit()
        log.info('end process illust_id: {}'.format(
            illustration_image.illust_id))
    log.info(
        'end download illust by illustration_id: {}, illust image size: {}'.
        format(illustration_id, len(illustration_images)))