Code example #1
def is_special_illust_ids(illust_path: str = None, **kwargs) -> bool:
    if not kwargs.get('user_id') and not kwargs.get('illust_id'):
        log.error('The user_id or illust_id is empty.')
        return False
    user_id = kwargs.get('user_id')
    cache_illust_ids_path = os.path.join(
        os.path.dirname(__file__), 'cache', str(user_id) + '-illust-ids.json')
    if not os.path.isfile(cache_illust_ids_path):
        # query all illust ids of the given user from the database, ordered by bookmarks
        illust_ids = session.query(Illustration).filter(Illustration.user_id == user_id)\
            .order_by(Illustration.total_bookmarks.desc()).all()
        illust_ids = [x.id for x in illust_ids]
        log.info('query user_id: {}, illust_ids_size: {}'.format(
            user_id, len(illust_ids)))
        with open(cache_illust_ids_path, 'w', encoding='utf-8') as cache_file:
            json.dump(illust_ids, cache_file, ensure_ascii=False, indent=4)
    else:
        with open(cache_illust_ids_path, 'r', encoding='utf-8') as cache_file:
            illust_ids = json.load(cache_file)
    current_illust_id = get_illust_id(illust_path)
    return current_illust_id in illust_ids
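get_illust_id is defined elsewhere in the project. Judging only from how it is called in these examples (on file paths, bare file names and even directories, with a negative value meaning "not found"), a minimal hypothetical sketch might look like the following; the filename convention (a leading numeric id such as 12345678_p0.jpg) and the helper body are assumptions, not taken from the source, and os/re are assumed to be imported.

def get_illust_id(path: str) -> int:
    # hypothetical sketch: parse a leading numeric id from the last path component,
    # e.g. '12345678_p0.jpg' -> 12345678; return -1 when no id can be found
    name = os.path.basename(path.rstrip('\\/'))
    match = re.match(r'(\d+)', name)
    return int(match.group(1)) if match else -1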
Code example #2
def d_hash(image):
    """
    差异值哈希算法(dHash):
    相比pHash,dHash的速度要快的多,相比aHash,dHash在效率几乎相同的情况下的效果要更好,它是基于渐变实现的。
    dHash的hanming距离步骤:
    1. 先将图片压缩成9*8的小图,有72个像素点
    2. 将图片转化为灰度图
    3. 计算差异值:dHash算法工作在相邻像素之间,这样每行9个像素之间产生了8个不同的差异,一共8行,则产生了64个差异值,或者是32位01字符串。
    4. 获得指纹:如果左边的像素比右边的更亮,则记录为1,否则为0.
    5. 通过hash值来计算汉明距离
    :param image:
    :return: hash_str
    """
    image = cv2.resize(image, (9, 8))
    # convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    hash_str = ''
    # for each row, write 1 if the left pixel is brighter than the next one, otherwise 0, to build the hash
    for i in range(8):
        for j in range(8):
            if gray[i, j] > gray[i, j + 1]:
                hash_str = hash_str + '1'
            else:
                hash_str = hash_str + '0'
    result = ''
    for i in range(0, 64, 4):
        result += '%x' % int(hash_str[i:i + 4], 2)
    log.info('The image d_hash is: {}'.format(result))
    return result
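The docstring's final step, comparing two fingerprints by Hamming distance, is not part of d_hash itself. A minimal sketch of that comparison, assuming both arguments are hex strings of equal length as produced by d_hash:

def hamming_distance(hash1: str, hash2: str) -> int:
    # count differing bits between two hex fingerprints of equal length
    if len(hash1) != len(hash2):
        raise ValueError('hash length mismatch')
    bits1 = bin(int(hash1, 16))[2:].zfill(len(hash1) * 4)
    bits2 = bin(int(hash2, 16))[2:].zfill(len(hash2) * 4)
    return sum(b1 != b2 for b1, b2 in zip(bits1, bits2))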
Code example #3
File: collect.py Project: youyouzh/python-base
def collect_illusts(collect_tag='back', collect_function=None, max_collect_count=10, **kwargs):
    """
    将满足某个条件的插画全部移动到指定的收藏文件夹
    :param collect_tag:
    :param collect_function:
    :param max_collect_count:
    :param kwargs:
    :return:
    """
    log.info('begin collect illusts. tag: {}, max_collect_count: {}'.format(collect_tag, max_collect_count))
    default_kwargs = {
        'target_directory': r'G:\Projects\Python_Projects\python-base\spider\pixiv\crawler\result\illusts',
        'use_cache': True
    }
    default_kwargs.update(kwargs)
    kwargs = default_kwargs

    illust_paths = get_all_image_paths(kwargs.get('target_directory'), kwargs.get('use_cache'))
    collect_count = 0
    for illust_path in illust_paths:
        if not os.path.isfile(illust_path):
            # log.warn('The file is not exist: {}'.format(illust_path))
            continue
        if collect_function(illust_path, **kwargs):
            collect_illust(collect_tag, illust_path)
            collect_count += 1
        if collect_count >= max_collect_count:
            break
    log.info('----> total move file count: {}'.format(collect_count))
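As a hedged usage sketch: is_special_illust_ids from code example #1 already matches the collect_function(illust_path, **kwargs) contract, so it can be passed in directly; the tag name and user_id below are only placeholders.

# usage sketch: collect up to 20 illustrations of one user into a 'special' folder
collect_illusts(collect_tag='special',
                collect_function=is_special_illust_ids,
                max_collect_count=20,
                user_id=123456)  # placeholder user id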
Code example #4
File: collect.py Project: youyouzh/python-base
def update_dir_illust_tag(directory: str, tag: str):
    """
    将某个文件夹下的所有文件在illust数据库中的记录标记tag
    :param directory: 目标文件夹
    :param tag: 某个类型的标记名称,
               ignore: 校验过不需要的插画
               downloaded: 已经下载的图片
               small: 图片太小
               delete: 直接删除
               too_long: 太长啦,一帮是那种漫画
               gray: 黑白插画
    :return: None
    """
    if not os.path.exists(directory):
        log.error('The directory does not exist: {}'.format(directory))
        return
    illust_files = os.listdir(directory)
    for illust_file in illust_files:
        # build the full path and skip sub-directories
        if os.path.isdir(os.path.join(directory, illust_file)):
            continue
        log.info('process file: ' + illust_file)
        # extract illust_id from the file name
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.warn('The illust_id can not be parsed from the file name. file: {}'.format(illust_file))
            continue
        update_illustration_tag(illust_id, tag)
        # os.remove(os.path.join(directory, illust_file))
    log.info('process end. total illust size: {}'.format(len(illust_files)))
Code example #5
File: u_file.py Project: youyouzh/python-base
def get_all_sub_files(root_path, all_files=None, contain_dir=False):
    """
    递归获取所有子文件列表
    :param root_path: 递归根目录
    :param all_files: 递归过程中的所有文件列表
    :param contain_dir: 返回值是否包含目录
    :return:
    """
    if all_files is None:
        all_files = []

    # if root_path is not a directory, return the list as-is
    if not os.path.isdir(root_path):
        return all_files
    else:
        log.info('begin through path: {}'.format(root_path))

    # list all file and directory names under this directory
    dir_or_files = os.listdir(root_path)
    for dir_or_file in dir_or_files:
        dir_or_file = os.path.join(root_path, dir_or_file)  # build the full path

        if os.path.isdir(dir_or_file):
            # recurse into sub-directories
            if contain_dir:
                all_files.append(dir_or_file)
            get_all_sub_files(dir_or_file, all_files, contain_dir)
        else:
            # otherwise add the current file to all_files
            all_files.append(os.path.abspath(dir_or_file))
    return all_files
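A short usage sketch (the directory below is only an example):

# usage sketch: list every file under a directory, directories excluded
image_files = get_all_sub_files(r'D:\images', contain_dir=False)  # example path
log.info('found {} files'.format(len(image_files)))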
Code example #6
File: crawl.py Project: youyouzh/python-base
def get_stage_course_info(stage_course_url: str) -> dict:
    """
    提取课程每个步骤详情页的作业题目信息和课程等信息
    :param stage_course_url: 每个步骤课程详情页
    :return: 作业题目信息
    """
    cache_file = r'cache\course-step-info.html'
    html_content = u_file.get_content_with_cache(stage_course_url, cache_file)

    # the page builds its document via js, so the init data can only be extracted with a regex
    json_data = u_file.extract_init_json_data(html_content, INIT_JSON_PARSE_PATTERN)
    questions = json_data['coach']['subjectList']
    log.info('question size: {}'.format(len(questions)))

    # extract question_infos
    question_infos = []
    keep_question_fields = ['id', 'name', 'title', 'tip', 'summary', 'url', 'files', 'questionCount']
    for question in questions:
        question_info = extract_dict_field(question, keep_question_fields)
        question_info['url'] = get_question_url(question['id'])
        question_infos.append(question_info)

    # extract chapter infos
    chapter_infos = json_data['course']['detail']['chapters']
    return {
        'questions': question_infos,
        'chapters': chapter_infos
    }
Code example #7
File: crawler.py Project: youyouzh/python-base
def crawl_video_info(template_page_url: str):
    max_page = 140
    video_infos = []
    parse_url = urlparse(template_page_url)
    for index in range(1, max_page):
        log.info('begin crawl page.({}/{})'.format(index, max_page))
        html_content = u_file.get_content(template_page_url.format(index))
        soup = BeautifulSoup(html_content, 'lxml')

        video_nodes = soup.select('div.stui-vodlist__detail')
        log.info('video size: {}'.format(len(video_nodes)))
        for video_node in video_nodes:
            a_node = video_node.select_one('h4 > a')
            span_node = video_node.select('p.sub > span')
            view_count = int(span_node[2].text.strip())
            like_count = int(span_node[1].text.strip())
            video_infos.append({
                'title': a_node.string,
                'url': parse_url._replace(path=a_node['href']).geturl(),
                'view': view_count,
                'like': like_count
            })
        video_infos.sort(key=lambda x: x['like'], reverse=True)
        u_file.cache_json(video_infos, r'result\video-infos.jon')
    return video_infos
Code example #8
def move_test_file(predict_test_file, main_file_path, main_filename):
    """
    移动测试文件
    :param predict_test_file: 测试文件
    :param main_file_path: main文件夹路径
    :param main_filename: main下的class文件名
    :return:
    """
    move_target_test_path = main_file_path.replace('main', 'test')
    move_target_test_path = os.path.join(
        move_target_test_path, main_filename.replace('.java', 'Test.java'))
    log.info('The test file is exist. move {} -> {}'.format(
        predict_test_file, move_target_test_path))

    # move the file
    u_file.ready_dir(move_target_test_path)
    os.replace(predict_test_file, move_target_test_path)

    # rename the class inside the moved file
    with open(move_target_test_path, 'r+', encoding='UTF-8') as handler:
        content = handler.read()
        handler.seek(0)
        handler.write(
            content.replace(
                os.path.split(predict_test_file)[1].split('.')[0],
                main_filename.replace('.java', 'Test')))
        handler.truncate()
Code example #9
File: arrange.py Project: youyouzh/python-base
def check_user_id(source_dir: str,
                  user_dir: str,
                  user_id=None,
                  keep_source=True,
                  use_cache=True,
                  replace_user_file=False):
    """
    检查和移动某个用户下的图片到目标文件夹
    :param user_id: 指定用户id
    :param source_dir: 需要处理的文件夹
    :param user_dir: 某个用户专有的插画集文件夹,移动文件的目标文件夹
    :param keep_source: 是否保留原来的文件,如果存在重复的时候生效
    :param use_cache: 是否使用缓存中的文件目录
    :param replace_user_file: 是否替换掉用户文件夹中的文件
    :return:
    """
    if not os.path.isdir(user_dir):
        log.error(
            'The user directory does not exist. directory: {}'.format(user_dir))
        return None

    parse_user_id = get_illust_id(user_dir)
    if user_id is None and parse_user_id >= 0:
        user_id = parse_user_id

    image_meta_infos = get_image_meta_infos(source_dir, use_cache)
    log.info('total image file size: {}'.format(len(image_meta_infos)))

    index = 0
    move_file_size = 0
    for image_meta_info in image_meta_infos:
        index += 1
        # if index % 1000 == 0:
        #     log.info('processed file size: {}'.format(index))
        if image_meta_info.get('user_id') != user_id:
            continue

        if not os.path.isfile(image_meta_info.get('path')):
            log.info('The file was deleted. path: {}'.format(
                image_meta_info.get('path')))
            continue

        log.info('The illust({}) belongs to user_id({}).'.format(
            image_meta_info.get('illust_id'), user_id))
        move_target_path = os.path.join(user_dir,
                                        image_meta_info.get('file_name'))
        if os.path.isfile(move_target_path):
            log.warn('The target user illust already exists: {}, keep: {}'.format(
                move_target_path, keep_source))
            if keep_source:
                continue

        move_file_size += 1
        if replace_user_file:
            log.info('begin move file from: {} to : {}'.format(
                image_meta_info.get('path'), move_target_path))
            os.replace(image_meta_info.get('path'), move_target_path)
    log.info('end check user_id, hit file size: {}, dir: {}'.format(
        move_file_size, user_dir))
Code example #10
File: crawler.py Project: youyouzh/python-base
def download_ts_file(m3u8_url: str, ts_urls: List[str]):
    save_dir = get_ts_ave_dir(m3u8_url)
    index = 1
    for ts_url in ts_urls:
        file_name = u_file.get_file_name_from_url(ts_url)
        u_file.download_file(ts_url, file_name, save_dir, **_REQUESTS_KWARGS)
        log.info('download ts file success({}/{}): {}'.format(
            index, len(ts_urls), ts_url))
        index += 1
Code example #11
def update_illustration_tag(illust_id, tag):
    illustration: Illustration = session.query(Illustration).get(illust_id)
    if illustration is None:
        log.info(
            'The illustration does not exist. illust_id: {}'.format(illust_id))
        return
    log.info('process illust_id: {}, set tag to: {} '.format(illust_id, tag))
    illustration.tag = tag
    session.commit()
Code example #12
File: crawler.py Project: youyouzh/python-base
def delete_file():
    delete_picture_paths = u_file.get_all_sub_files(r'result-delete')
    for delete_picture_path in delete_picture_paths:
        base_filename = os.path.split(delete_picture_path)[1]
        for index in range(30):
            source_filename = base_filename.replace('-1', '-' + str(index))
            source_path = os.path.join(r'result', source_filename)
            if not os.path.isfile(source_path):
                break
            log.info('move file: {}'.format(source_path))
Code example #13
File: u_file.py Project: youyouzh/python-base
def ready_dir(file_path: str):
    """
    准备相关文件夹,检查path所在文件夹是否存在,若不存在则创建
    :param file_path: 文件路径,不能是文件夹路径
    :return: None
    """
    dir_path = os.path.dirname(file_path)
    if not os.path.isdir(dir_path):
        log.info('the parent directory does not exist, creating: {}'.format(dir_path))
        os.makedirs(dir_path)
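As a side note, the same effect can be had with a single standard-library call; a sketch, assuming file_path always contains a directory component:

os.makedirs(os.path.dirname(file_path), exist_ok=True)  # creates missing parents, no error if the directory already exists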
Code example #14
def download_pictures(url: str, title: str) -> list:
    html_content = u_file.get_content(url, encoding='UTF-8')
    soup = BeautifulSoup(html_content, 'lxml')

    img_elements = soup.select('figure.img-box')
    log.info('get book elements size: {}'.format(len(img_elements)))
    for img_element in img_elements:
        image_url = img_element.find('img')['data-src']
        image_url = 'http:' + re.sub(r"@[^\n]+", '-', image_url)
        u_file.download_file(image_url, title + '-' + u_file.get_file_name_from_url(image_url), r'result')
    return []
Code example #15
File: u_file.py Project: youyouzh/python-base
def get_all_sub_files_with_cache(root_path, contain_dir=False, use_cache=True):
    cache_file = os.path.join(get_abs_cache_path(),
                              convert_windows_path(root_path))
    if use_cache and os.path.isfile(cache_file):
        log.info('load content from cache: {}'.format(cache_file))
        return load_json_from_file(cache_file)
    else:
        ready_dir(cache_file)
        sub_files = get_all_sub_files(root_path, contain_dir=contain_dir)
        cache_json(sub_files, cache_file)
        return sub_files
Code example #16
File: download.py Project: youyouzh/python-base
def download_task_by_illust_ids():
    save_directory = r'G:\Projects\Python_Projects\python-base\spider\pixiv\crawler\result\illusts-2020'
    illust_ids = [
        83955499, 78914920, 85204622, 86387545, 87833548, 86825654, 87844590
    ]
    log.info('begin download illust by ids. size: {}'.format(len(illust_ids)))
    for illust_id in illust_ids:
        download_by_illustration_id(save_directory,
                                    illust_id,
                                    skip_download=False,
                                    split_r_18=False)
    log.info('end')
Code example #17
def download_pins(pins: list, board_name: str):
    log.info('begin download board: {} pins image, size: {}'.format(
        board_name, len(pins)))
    save_dir = r'result'
    save_dir = os.path.join(save_dir, board_name)
    for pin in pins:
        u_file.download_file(pin['image_url'],
                             pin['id'],
                             path=save_dir,
                             **_REQUESTS_KWARGS)
    log.info('end download board: {} pins image, size: {}'.format(
        board_name, len(pins)))
Code example #18
File: crawler.py Project: youyouzh/python-base
def download_ts_file_with_pool(m3u8_url: str, ts_urls: List[str]):
    pool = ThreadPoolExecutor(10)
    save_dir = get_ts_ave_dir(m3u8_url)
    tasks = []
    for ts_url in ts_urls:
        file_name = u_file.get_file_name_from_url(ts_url)
        future = pool.submit(u_file.download_file, ts_url, file_name, save_dir,
                             **_REQUESTS_KWARGS)
        tasks.append(future)

    wait(tasks, return_when=ALL_COMPLETED)
    log.info('all ts file download success.')
Code example #19
File: collect.py Project: youyouzh/python-base
def update_sub_dir_illust_tag(parent_directory, tag):
    """
    将某个文件夹下的所有文件在illust数据库中的记录标记tag,支持两级文件夹
    :param parent_directory: 父级文件夹
    :param tag: 需要更新的标签
    :return: None
    """
    child_directories = os.listdir(parent_directory)
    for directory in child_directories:
        directory = os.path.join(parent_directory, directory)
        log.info('begin process directory: {}'.format(directory))
        update_dir_illust_tag(directory, tag)
Code example #20
File: crawler.py Project: youyouzh/python-base
def download_top():
    posts = query_top_score_posts(10000)
    directory = r'result'
    for post in posts:
        post = query_post(post.get('id'))
        if post.mark == 'downloaded':
            u_log.info('the post has been downloaded. id: {}'.format(post.id))
            continue
        u_log.info('begin download post. id: {}, score: {}, size: {}'.format(post.id, post.score, post.file_size))
        file_name = u_file.get_file_name_from_url(post.file_url)
        u_file.download_file(post.file_url, file_name, directory)
        mark_post(post, 'downloaded')
Code example #21
File: u_file.py Project: youyouzh/python-base
def load_json_from_file(json_file) -> dict:
    """
    从文件中加载json数据
    :param json_file:
    :return:
    """
    json_data = None
    if os.path.isfile(json_file):
        with open(json_file, encoding='utf-8') as file_handle:
            json_data = json.load(file_handle)
    log.info('load json from file success. file: {}'.format(json_file))
    return json_data
Code example #22
File: download.py Project: youyouzh/python-base
def download_from_url_files(url_file_path, save_directory):
    # create the save directory if it does not exist
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)
    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)
    url_list = u_file.read_file_as_list(url_file_path)
    log.info('begin download image, url size: ' + str(len(url_list)))
    index = 0
    for url in url_list:
        log.info('index: ' + str(index))
        download_task(pixiv_api, save_directory, url)
        index += 1
Code example #23
File: arrange.py Project: youyouzh/python-base
def get_image_meta_infos(target_directory: str, use_cache=True) -> list:
    cache_file_path = get_cache_path(target_directory, 'meta-info', 'json')
    cache_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                   cache_file_path)
    if use_cache and os.path.isfile(cache_file_path):
        with open(cache_file_path, 'r', encoding='utf-8') as cache_file:
            return json.load(cache_file)
    image_meta_infos = []

    image_paths = get_all_image_paths(target_directory, use_cache)
    log.info('total image file size: {}'.format(len(image_paths)))
    index = 0
    for image_path in image_paths:
        index += 1
        illust_id = get_illust_id(image_path)
        # log.info('get illust_id: {} ({}/{})'.format(illust_id, index, len(image_paths)))

        if illust_id < 0:
            log.warn(
                'The illust_id can not be parsed. image_path: {}'.format(image_path))
            continue

        if not os.path.isfile(image_path):
            log.warn(
                'The illust was deleted. image_path: {}'.format(image_path))
            continue

        illustration: Illustration = session.query(Illustration).get(illust_id)
        if illustration is None:
            log.warn('The illustration does not exist. illust_id: {}'.format(
                illust_id))
            continue

        image_meta_infos.append({
            'width': illustration.width,
            'height': illustration.height,
            'path': image_path,
            'file_name': os.path.split(image_path)[1],
            'illust_id': illust_id,
            'user_id': illustration.user_id,
            'size': os.path.getsize(image_path),
            'r_18': illustration.r_18,
            'bookmarks': illustration.total_bookmarks,
            'tag': illustration.tag
        })
    log.info('get_image_meta_infos end. image size: {}'.format(
        len(image_meta_infos)))
    with open(cache_file_path, 'w', encoding='utf-8') as cache_file:
        json.dump(image_meta_infos, cache_file, ensure_ascii=False, indent=4)
    return image_meta_infos
Code example #24
File: crawl.py Project: youyouzh/python-base
def get_video_notes(period_id: int) -> list:
    params = {
        '_ts_': '1621612527891',
        'periodId': str(period_id),
        'index': '1'
    }
    response = u_file.get_json('https://rt.qingwk.com/course/note/list', params=params)
    if 'data' not in response or 'datas' not in response['data']:
        log.error('The response has no notes')
        return []
    log.info('pageCount: {}, rowCount: {}'.format(response['data']['pageCount'], response['data']['rowCount']))
    notes = response['data']['datas']
    log.info('notes count: {}'.format(len(notes)))
    return notes
Code example #25
File: crawler.py Project: youyouzh/python-base
def get_all_page_book_list(template_url: str) -> list:
    max_page_size = 100
    book_infos = []
    for index in range(1, max_page_size):
        url = template_url.format(index)
        page_book_infos = get_book_list(url)
        if len(page_book_infos) == 0:
            log.warn('The book info list is empty. end crawler.')
            break
        book_infos.extend(page_book_infos)
        log.info('end crawler url: {}, book size: {}'.format(
            url, len(page_book_infos)))
        u_file.cache_json(book_infos, r'result/total_book_info.json')
    return book_infos
Code example #26
File: jiemo_crawler.py Project: youyouzh/python-base
def parse_and_save_grammar_json(file_path: str):
    """
    讲语法讲解存入数据库中
    :param file_path:
    :return:
    """
    grammar_categories = u_file.load_json_from_file(file_path)
    if not grammar_categories or 'data' not in grammar_categories:
        log.warn('The grammar json is invalid: {}'.format(file_path))
        return

    log.info('load grammar json success. category size: {}'.format(len(grammar_categories)))
    grammar_categories = grammar_categories.get('data')
    for grammar_category in grammar_categories:
        log.info('parse grammar category: {}'.format(grammar_category.get('title')))
        if grammar_category.get('title') != grammar_category.get('label'):
            log.warn('The grammar title and label is not same.')
        grammars = grammar_category.get('grammerList')
        log.info('parse grammar category sub grammar. category: {}, grammar size: {}'
                   .format(grammar_category.get('title'), len(grammars)))
        for grammar in grammars:
            if grammar.get('explain') != grammar.get('comment') or grammar.get('type') != grammar.get('category') \
                    or grammar.get('category') != grammar_category.get('title'):
                log.warn('The grammar category is special. grammar: {}'.format(grammar.get('grammar')))
            log.info('get grammar: {}'.format(grammar.get('grammar')))
            db_grammar = Grammar(id=grammar.get('id'), content=grammar.get('content'))
            db_grammar.level = grammar.get('level')
            db_grammar.category = grammar.get('category')
            db_grammar.type = grammar.get('category')
            db_grammar.link = grammar.get('link')
            db_grammar.explain = grammar.get('explain')
            db_grammar.example = re.sub('[#@][0-9]*', '', grammar.get('exmple'))
            db_grammar.postscript = grammar.get('ps')
            save_grammar(db_grammar)
Code example #27
File: file_util.py Project: youyouzh/python-base
def get_all_image_paths(image_directory: str, use_cache: bool = True, contain_dir=False) -> list:
    """
    递归获取某个文件夹下的所有图片和文件夹
    :param image_directory: 图片路径
    :param use_cache: 是否使用缓存
    :param contain_dir: 返回值是否包含目录
    :return: 图片绝对路径列表
    """
    log.info('begin get all image files from path: {}'.format(image_directory))
    if not os.path.isdir(image_directory):
        log.error('The image directory does not exist: {}'.format(image_directory))
        return []

    # build the cache path and check whether a cache file already exists
    cache_file_path = get_cache_path(image_directory, 'image_paths', 'txt')
    cache_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), cache_file_path)
    if use_cache and os.path.isfile(cache_file_path):
        # a cache file exists, use it directly
        log.info('read all image file from cache: {}'.format(cache_file_path))
        return u_file.read_file_as_list(cache_file_path)

    # create the cache directory if it does not exist
    if not os.path.isdir(os.path.split(cache_file_path)[0]):
        log.info('create the cache directory: {}'.format(cache_file_path))
        os.makedirs(os.path.split(cache_file_path)[0])
    all_files = u_file.get_all_sub_files(image_directory, contain_dir=contain_dir)

    # write the result to the cache file
    with open(cache_file_path, 'w+', encoding='utf-8') as cache_file_handler:
        for file in all_files:
            cache_file_handler.write(file + '\n')
    log.info('get_all_image_files finish. file size: {}'.format(len(all_files)))
    return all_files
Code example #28
File: crawler.py Project: youyouzh/python-base
def decrypt_aes(m3u8_url: str, encrypt_data):
    # get decrypt key
    key_url = urljoin(m3u8_url, 'key.key')
    parse_url = urlparse(key_url)
    cache_file = os.path.join(r'result\m3u8',
                              u_file.convert_windows_path(parse_url.path))
    key = u_file.get_content_with_cache(key_url, cache_file)
    log.info('get key success: {}'.format(key))

    # aes decrypt input
    iv = b'0000000000000000'
    cipher = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv)
    decrypt_data = cipher.decrypt(encrypt_data)
    return decrypt_data.rstrip(b'\0')
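A hedged usage sketch: read one previously downloaded, encrypted .ts segment from disk, decrypt it and write the plain segment back out. The m3u8 URL and both file paths below are placeholders.

# usage sketch: decrypt a single cached .ts segment (paths and URL are placeholders)
with open(r'result\m3u8\segment-000.ts', 'rb') as encrypted_file:
    encrypt_data = encrypted_file.read()
decrypt_data = decrypt_aes('https://example.com/video/index.m3u8', encrypt_data)
with open(r'result\m3u8\segment-000-plain.ts', 'wb') as plain_file:
    plain_file.write(decrypt_data)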
Code example #29
File: crawler.py Project: youyouzh/python-base
def extract_m3u8_url(html_content: str) -> str or None:
    pattern = re.compile(r'player_aaaa=(\{.+\})')
    search_content = re.search(pattern, html_content)
    if search_content is None:
        log.error('Can not match any m3u8 url.')
        return None
    init_json = search_content.group(1)
    json_data = json.loads(init_json)
    if 'url' not in json_data:
        log.error('Can not find url: {}'.format(init_json))
        return None
    log.info('extract url: {}'.format(json_data['url']))
    return json_data['url']
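A short usage sketch (the page URL is a placeholder; u_file.get_content is used the same way as in code examples #7 and #14):

html_content = u_file.get_content('https://example.com/play/123.html')  # placeholder URL
m3u8_url = extract_m3u8_url(html_content)
if m3u8_url:
    log.info('found m3u8 url: {}'.format(m3u8_url))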
Code example #30
File: u_file.py Project: youyouzh/python-base
def read_content(file_path):
    """
    read content from file, use UTF-8 encoding
    :param file_path: target file path
    :return: file content
    """
    if not os.path.isfile(file_path):
        log.warn('The file does not exist: {}'.format(file_path))
        return None
    log.info('read content from file: {}'.format(file_path))
    with open(file_path, 'r', encoding='UTF-8') as fin:
        content = fin.read()
    return content