def download_ts_file(m3u8_url: str, ts_urls: List[str]):
    """Download every ts segment of an m3u8 playlist into its save directory.

    :param m3u8_url: playlist url, used only to derive the save directory
    :param ts_urls: full list of segment urls to fetch
    """
    save_dir = get_ts_ave_dir(m3u8_url)
    total = len(ts_urls)  # hoist the loop-invariant length
    # enumerate(start=1) replaces the hand-rolled index counter
    for index, ts_url in enumerate(ts_urls, start=1):
        file_name = u_file.get_file_name_from_url(ts_url)
        u_file.download_file(ts_url, file_name, save_dir, **_REQUESTS_KWARGS)
        log.info('download ts file success({}/{}): {}'.format(index, total, ts_url))
def download_pictures(url: str, title: str) -> list:
    """Fetch the page at *url* and download every figure-box image it contains.

    Downloaded file names are prefixed with *title*. Always returns an empty
    list (kept for interface compatibility with callers).
    """
    page = u_file.get_content(url, encoding='UTF-8')
    soup = BeautifulSoup(page, 'lxml')
    figures = soup.select('figure.img-box')
    log.info('get book elements size: {}'.format(len(figures)))
    for figure in figures:
        raw_url = figure.find('img')['data-src']
        # NOTE(review): the '@...' suffix (resize parameters) is replaced with
        # '-' rather than stripped — presumably intentional; confirm upstream.
        image_url = 'http:' + re.sub(r"@[^\n]+", '-', raw_url)
        file_name = title + '-' + u_file.get_file_name_from_url(image_url)
        u_file.download_file(image_url, file_name, r'result')
    return []
def download_pins(pins: list, board_name: str):
    """Download every pin image of a board into result/<board_name>.

    :param pins: list of dicts with at least 'image_url' and 'id' keys
    :param board_name: board name, used as the sub-directory name
    """
    log.info('begin download board: {} pins image, size: {}'.format(board_name, len(pins)))
    target_dir = os.path.join(r'result', board_name)
    for pin in pins:
        u_file.download_file(pin['image_url'], pin['id'], path=target_dir, **_REQUESTS_KWARGS)
    log.info('end download board: {} pins image, size: {}'.format(board_name, len(pins)))
def download_top():
    """Download the top-scored posts into result/, skipping already-downloaded ones.

    Each successful download marks the post 'downloaded' so re-runs are incremental.
    """
    directory = r'result'
    for summary in query_top_score_posts(10000):
        # refresh the full record by id before inspecting its mark/score
        post = query_post(summary.get('id'))
        if post.mark == 'downloaded':
            u_log.info('the post has been downloaded. id: {}'.format(post.id))
            continue
        u_log.info('begin download post. id: {}, score: {}, size: {}'.format(post.id, post.score, post.file_size))
        u_file.download_file(post.file_url, u_file.get_file_name_from_url(post.file_url), directory)
        mark_post(post, 'downloaded')
def download_tag(tag: str = 'f******o', min_score: int = 30):
    """Download all posts of *tag* whose score is at least *min_score*.

    Posts already marked 'downloaded' are skipped; every successful download
    marks the post so re-runs are incremental.

    :param tag: tag to query posts for (default preserves the original behavior)
    :param min_score: minimum score a post needs to be downloaded
    """
    posts = query_posts_by_tag(tag)
    # os.path.join matches the path handling used elsewhere in this module
    # (the original concatenated a Windows-only '\\' separator by hand)
    directory = os.path.join(r'result', tag)
    for post in posts:
        post = query_post(post.get('id'))  # refresh the full record by id
        if post.mark == 'downloaded':
            u_log.info('the post has been downloaded. id: {}'.format(post.id))
            continue
        if post.score < min_score:
            u_log.info('the post score is low. id: {}, score: {}'.format(post.id, post.score))
            continue
        u_log.info('begin download post. id: {}, score: {}, size: {}'.format(post.id, post.score, post.file_size))
        file_name = u_file.get_file_name_from_url(post.file_url)
        u_file.download_file(post.file_url, file_name, directory)
        mark_post(post, 'downloaded')
def download_image_collect(image_collect: dict, save_dir=r'result'):
    """Download every image of a single image-collection page.

    :param image_collect: dict with at least 'url' and 'title' keys
    :param save_dir: directory the image files are written into
    """
    html_content = u_file.get_content(image_collect['url'], encoding='gb2312')
    img_elements = BeautifulSoup(html_content, 'lxml').select('ul#showImg > li img')
    log.info('The image collect image size: {}'.format(len(img_elements)))
    title = image_collect['title']
    for img_element in img_elements:
        # drop the '-lp' thumbnail marker to get the full-size image url
        full_url = 'http:' + img_element['src'].replace('-lp', '')
        filename = title + '-' + u_file.get_file_name_from_url(full_url)
        u_file.download_file(full_url, filename, save_dir)
def through_pose(url):
    """Walk the numbered pose urls derived from *url*, downloading each image.

    The walk stops at the first url that fails to download (pose numbers are
    contiguous, so a miss means the sequence is exhausted).

    :param url: a pose url containing the 'pose_0001' index marker
    """
    # fixed: the original literal had no '{}' placeholder, so the url was never logged
    log.info('begin through url: {}'.format(url))
    url_template = url.replace('pose_0001', 'pose_%04d')
    base_path = os.path.abspath(r'./result')
    if not os.path.isdir(base_path):
        os.mkdir(base_path)
    for index in range(1, 5):
        pose_url = url_template % index
        name = re.sub(r"[\\/?*<>|\":]+", '-',
                      pose_url.replace(r'http://www.posemaniacs.com/pose/', ''))
        # group files into a per-name folder, recomputed from base_path each
        # iteration (the original mutated `path` in the loop, so every later
        # pose silently reused whatever folder matched first)
        path = base_path
        for path_key, path_value in PATH_MAP.items():
            if path_key in name:
                path = base_path.replace('result', path_value)
                break
        log.info('begin download image from url: {}'.format(pose_url))
        if not u_file.download_file(pose_url, name=name, path=path):
            log.info('download end. index: {}'.format(index))
            break
def test_download_file():
    """Download a known m4a file and assert it lands in the cache directory."""
    url = 'http://aod.cos.tx.xmcdn.com/group20/M01/7E/F8/wKgJJ1eoW8uBquKEACmsecPrn1o863.m4a'
    u_file.download_file(url, '19663334', r'cache')
    u_unittest.assert_true(os.path.isfile(r'cache\19663334.m4a'))