Example #1
def fill_download_url(book_infos: list) -> list:
    log.info('total book infos size: {}'.format(len(book_infos)))
    for book_info in book_infos:
        if 'download_url' in book_info:
            log.info(
                'This book already has a download_url. {}'.format(book_info))
            continue
        html_content = u_file.get_content(book_info['download_page'],
                                          encoding='gb2312')

        # the response is turned into a document by JavaScript on the page, so match the raw inline script
        download_info_pattern = re.compile(
            r'_downInfo = (\{Address:.+\})</script>')
        address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')

        search_download_content = re.search(download_info_pattern,
                                            html_content)
        search_address_content = re.search(address_pattern, html_content)
        if search_download_content is None or search_address_content is None:
            log.error('Cannot match any download data.')
            continue

        download_address = search_address_content.group(1)
        log.info('download_info: {}'.format(search_download_content.group(1)))

        book_info['download_url'] = DOWNLOAD_BASE_URL + download_address
        book_info['download_info'] = search_download_content.group(1)
        u_file.cache_json(book_infos, r'result/full_book_infos.json')
    return book_infos
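For reference, a minimal sketch (the script snippet is made up) of what the two patterns above pull out of the inline _downInfo assignment:

import re

sample = '<script>var _downInfo = {Address:"soft/2021/sample-book.zip",TypeID:12}</script>'
download_info_pattern = re.compile(r'_downInfo = (\{Address:.+\})</script>')
address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')

print(re.search(download_info_pattern, sample).group(1))  # {Address:"soft/2021/sample-book.zip",TypeID:12}
print(re.search(address_pattern, sample).group(1))         # soft/2021/sample-book.zip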
Example #2
def crawl_video_info(template_page_url: str):
    max_page = 140
    video_infos = []
    parse_url = urlparse(template_page_url)
    for index in range(1, max_page):
        log.info('begin crawl page.({}/{})'.format(index, max_page))
        html_content = u_file.get_content(template_page_url.format(index))
        soup = BeautifulSoup(html_content, 'lxml')

        video_nodes = soup.select('div.stui-vodlist__detail')
        log.info('video size: {}'.format(len(video_nodes)))
        for video_node in video_nodes:
            a_node = video_node.select_one('h4 > a')
            span_node = video_node.select('p.sub > span')
            view_count = int(span_node[2].text.strip())
            like_count = int(span_node[1].text.strip())
            video_infos.append({
                'title': a_node.string,
                'url': parse_url._replace(path=a_node['href']).geturl(),
                'view': view_count,
                'like': like_count
            })
        video_infos.sort(key=lambda x: x['like'], reverse=True)
        u_file.cache_json(video_infos, r'result\video-infos.json')
    return video_infos
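A minimal sketch (URLs are made up) of the urlparse()._replace() trick used above, which keeps the scheme and host of the listing page and swaps in each video's relative href:

from urllib.parse import urlparse

page_url = 'https://example.com/type/1-{}.html'
href = '/video/12345.html'
absolute_url = urlparse(page_url)._replace(path=href).geturl()
print(absolute_url)  # https://example.com/video/12345.html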
Example #3
def download_by_page_url(page_url: str):
    """
    下载 hsck.us
    :param page_url: 视频页面地址
    :return: None
    """
    response = u_file.get_content(page_url)
    m3u8_url = extract_m3u8_url(response)
    title = extract_title(response)
    download_with_m3u8_url(title, m3u8_url)
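The download_with_m3u8_url helper is not shown here. A minimal sketch of what such a helper could do, assuming a flat, unencrypted playlist with plain HTTP segments (the function name and details below are hypothetical):

import requests

def download_with_m3u8_url_sketch(title: str, m3u8_url: str):
    # fetch the playlist and resolve each segment URI against the playlist's directory
    playlist = requests.get(m3u8_url).text
    base_url = m3u8_url.rsplit('/', 1)[0]
    segment_urls = [line if line.startswith('http') else base_url + '/' + line
                    for line in playlist.splitlines()
                    if line and not line.startswith('#')]
    # append every .ts chunk to a single output file
    with open(title + '.ts', 'wb') as output_file:
        for segment_url in segment_urls:
            output_file.write(requests.get(segment_url).content)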
Example #4
def download_pictures(url: str, title: str) -> list:
    html_content = u_file.get_content(url, encoding='UTF-8')
    soup = BeautifulSoup(html_content, 'lxml')

    img_elements = soup.select('figure.img-box')
    log.info('get book elements size: {}'.format(len(img_elements)))
    for img_element in img_elements:
        image_url = img_element.find('img')['data-src']
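        # site-specific assumption: data-src is protocol-relative and carries an "@..." resize suffix after the real file name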
        image_url = 'http:' + re.sub(r"@[^\n]+", '-', image_url)
        u_file.download_file(image_url, title + '-' + u_file.get_file_name_from_url(image_url), r'result')
    return []
Example #5
def crawler():
    log.info('------begin crawler------')
    crawl_count = 1
    begin_url = CONFIG.get('host') + (CONFIG.get("page_url_template") %
                                      crawl_count)
    html_content = u_file.get_content(begin_url)
    while html_content and crawl_count <= CONFIG.get('crawl_max_count'):
        pose_image_urls = extract_pose_urls(html_content)
        log.info("extract pose image urls success. size: {}".format(
            len(pose_image_urls)))
        for pose_image_url in pose_image_urls:
            log.info(
                "begin crawl from pose image url: {}".format(pose_image_url))
            through_pose(pose_image_url)
            log.info(
                "end crawl from pose image url: {}".format(pose_image_url))
        crawl_count += 1
        html_content = u_file.get_content(
            CONFIG.get('host') +
            (CONFIG.get("page_url_template") % crawl_count))
    log.info('------end crawler------')
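A minimal sketch (values are made up; the real CONFIG is not shown) of the entries the loop above relies on; page_url_template is an old-style % format string that takes the page number:

CONFIG = {
    'host': 'https://example.com',
    'page_url_template': '/photo/list_%d.html',
    'crawl_max_count': 10,
}
first_page_url = CONFIG.get('host') + (CONFIG.get('page_url_template') % 1)
# -> https://example.com/photo/list_1.html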
Example #6
def get_image_list(url: str) -> list:
    html_content = u_file.get_content(url, encoding='gb2312')
    soup = BeautifulSoup(html_content, 'lxml')

    # find image collection
    image_li_elements = soup.select('ul.picbz > li')
    log.info('image li list size: {}'.format(len(image_li_elements)))

    image_collects = []
    for image_li_element in image_li_elements:
        a_element = image_li_element.find_all('a')[1]
        image_collects.append({
            'url': 'http://www.jj20.com' + a_element['href'],
            'title': a_element.string
        })
    return image_collects
Example #7
def build_content_html():
    target_album_id = 4815905
    track_infos = get_album_track_info_from_cache(target_album_id)
    template_content = u_file.get_content(r'cache\template.html')

    content = ''
    for track_info in track_infos:
        if track_info.get('title').find('坂本真绫') < 0:
            u_log.info('The track is not needed. title: {}'.format(track_info.get('title')))
            continue
        if track_info.get('title').find('合集') >= 0:
            u_log.info('The track is collection. title: {}'.format(track_info.get('title')))
            continue
        content += '\n\n<h2>' + track_info.get('title') + '</h2>\n\n'
        content += track_info.get('richIntro')
    template_content = template_content.replace('{content}', content)
    u_file.write_content(r'cache\target.html', template_content)
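A minimal sketch (the template text is made up) of the placeholder substitution; str.replace returns a new string, which is why the result is reassigned above before being written out:

template_content = '<html><body>{content}</body></html>'
content = '<h2>track title</h2>intro text'
page = template_content.replace('{content}', content)
print(page)  # <html><body><h2>track title</h2>intro text</body></html>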
Example #8
def get_all_urls(url: str) -> list:
    html_content = u_file.get_content(url, encoding='UTF-8')
    soup = BeautifulSoup(html_content, 'lxml')

    infos = []
    comment_node = soup.select('div.is-top p.text')
    texts = comment_node[0].string.split('\n')
    a_nodes = comment_node[0].find_all('a')

    index = 1
    for a_node in a_nodes:
        infos.append({
            'url': a_node['href'],
            'title': texts[index]
        })
        index += 1
    return infos
Example #9
def download_image_collect(image_collect: dict, save_dir=r'result'):
    html_content = u_file.get_content(image_collect['url'], encoding='gb2312')
    soup = BeautifulSoup(html_content, 'lxml')

    image_collection_img_elements = soup.select('ul#showImg > li img')
    image_count = len(image_collection_img_elements)
    log.info('The image collect image size: {}'.format(image_count))

    # image_download_button_element = soup.select('span#kk > a')
    # full_image_url = image_download_button_element['href']
    # full_image_url = full_image_url.replace('http://cj.jj20.com/2020/down.html?picurl=', 'http://pic.jj20.com')
    for image_collection_img_element in image_collection_img_elements:
        current_image_url = image_collection_img_element['src']
        current_image_url = current_image_url.replace('-lp', '')
        current_image_url = 'http:' + current_image_url
        filename = image_collect['title'] + '-' + u_file.get_file_name_from_url(current_image_url)
        u_file.download_file(current_image_url, filename, save_dir)
Example #10
def get_book_list(url: str) -> list:
    html_content = u_file.get_content(url, encoding='gb2312')
    soup = BeautifulSoup(html_content, 'lxml')

    book_elements = soup.select('li.item > a')
    log.info('get book elements size: {}'.format(len(book_elements)))

    book_infos = []
    for book_element in book_elements:
        book_infos.append({
            'download_page': BASE_HOST + book_element['href'],
            'cover_image_url': book_element.find('img', {'class': 'tu'})['src'],
            'title': book_element.select('div.info > p.name')[0].string,
            'update_time': book_element.select('div.info > p.type > span')[0].string,
            'size': book_element.select('div.info > p.type > span')[1].string
        })
    u_file.cache_json(book_infos, r'result/book_info.json')
    return book_infos
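A minimal sketch (the markup is made up) of the list-item structure the selectors above expect:

from bs4 import BeautifulSoup

sample_html = '''
<ul>
  <li class="item">
    <a href="/book/1.html">
      <img class="tu" src="/cover/1.jpg"/>
      <div class="info">
        <p class="name">Sample Book</p>
        <p class="type"><span>2021-01-01</span><span>12MB</span></p>
      </div>
    </a>
  </li>
</ul>
'''
soup = BeautifulSoup(sample_html, 'lxml')
book_element = soup.select('li.item > a')[0]
print(book_element['href'])                                       # /book/1.html
print(book_element.find('img', {'class': 'tu'})['src'])           # /cover/1.jpg
print(book_element.select('div.info > p.name')[0].string)         # Sample Book
print(book_element.select('div.info > p.type > span')[1].string)  # 12MB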