# Beispiel #1 (example 1, score: 0)
def fetch_post_image_links(url, post_pages):
    print_log('开始收集帖子全部图片链接')
    post_content_reg = re.compile(setting.post_content_pattern, re.S)
    link_reg = re.compile(setting.img_link_pattern)
    link_reg2 = re.compile(setting.img_link_with_third_site_pattern)
    img_links = []
    for page in range(1, post_pages + 1):
        print '当前页数:%s\r' % page,
        curl_page_url = utils.make_url_with_page_num(url, page)
        whole_content = get_url_content(curl_page_url)
        if whole_content:
            content = (re.findall(post_content_reg, whole_content))[0]
            origin_img_links = re.findall(link_reg, content) + re.findall(link_reg2, content)
            img_links += [utils.make_real_img_link(link) for link in origin_img_links]
    print_log('收集链接完毕')
    return sorted(utils.clean_str_list(img_links))
# Beispiel #2 (example 2, score: 0)
def remove_repeat_img_links(record_file_name, img_links):
    """Filter out image links already recorded as downloaded.

    Args:
        record_file_name: path of a text file with one downloaded link
            per line. May not exist yet (first run).
        img_links: candidate image links.

    Returns:
        A sorted list of the links not present in the record file, or
        ``img_links`` unchanged when there is no record file.
    """
    if not os.path.exists(record_file_name):
        # No record yet: nothing has been downloaded, keep every link.
        # (Historically this branch pre-seeded the record file; that was
        # abandoned in favor of appending each link as it is downloaded.)
        return img_links
    # Read-only is sufficient — this function never writes the record;
    # links are appended elsewhere, one per successful download. The
    # original code opened with 'r+' and could leak the handle on error.
    with open(record_file_name, 'r') as download_records_file:
        existed_links = utils.clean_str_list(download_records_file.readlines())
    # New links = all candidates minus the already-downloaded ones.
    return sorted(set(img_links) - set(existed_links))