def download(page_url, output=DEFAULT_OUTPUT):
    """Download a page and its resources into the ``output`` directory.

    The page is loaded with ``load``, processed by ``dom.prepare_html`` and
    saved as an ``.html`` file inside ``output``. When ``dom.prepare_html``
    reports resources, a directory for them is created and
    ``download_resources`` is called.

    Args:
        page_url: Address of the page to download.
        output: Directory that receives the HTML file and the resources
            directory.

    Returns:
        Path of the saved HTML file (``output`` joined with the file name
        derived from ``page_url``).

    Raises:
        errors.SavingError: If the target HTML file or the resources
            directory already exists.
    """
    log.info(f"Loading page {page_url}")
    html = load(page_url)
    log.debug(f"{page_url} loaded")

    # Refuse to overwrite anything left by a previous download.
    html_path = os.path.join(output, url.to_file_name(page_url, ".html"))
    if os.path.exists(html_path):
        raise errors.SavingError(f"SavingError: '{html_path}' exists")

    dir_path = os.path.join(output, url.to_dir_name(page_url))
    if os.path.exists(dir_path):
        raise errors.SavingError(f"SavingError: '{dir_path}' exists")

    html_handled, resources = dom.prepare_html(html, page_url, dir_path)

    log.info(f"Saving page '{html_path}'")
    storage.save(html_handled, os.path.abspath(html_path))
    # Must be an f-string so the real path is interpolated into the message.
    log.debug(f"HTML '{html_path}' saved")

    if resources:
        storage.create_directory(dir_path)
        log.debug(f"Directory '{dir_path}' created")
        log.info(f"Downloading resources from {page_url}")
        download_resources(resources)

    return html_path
def download(url, user_path):
    """Download the page at ``url`` into the ``user_path`` directory.

    The page is saved as ``<page name>.html`` directly inside ``user_path``.
    When ``local_html.prepare`` returns sources, a ``<page name>_files``
    entry is created through ``storage.create`` and the sources are
    downloaded with ``download_sources``.

    Returns the path to the saved local HTML file.

    Raises ``Exception`` when ``user_path`` does not exist; a failed
    GET-request is re-raised via ``response.raise_for_status()``.
    """
    page_name = urls.to_name(url)

    # Guard clause: without the target directory nothing can be saved.
    if not os.path.exists(user_path):
        logging.error(f'Directory {user_path} not exist')
        raise Exception(
            f'folder "{user_path}" not found, loading {url} aborted',
        )
    file_path = f'{user_path}{os.sep}{page_name}.html'

    response = requests.get(url)
    if not response.ok:
        logging.error(
            f'GET-request for {url} failed. Code: {response.status_code}',
        )
        response.raise_for_status()

    prepared = local_html.prepare(url, page_name, response.content)
    page_html, page_sources = prepared

    logging.info(f'Saving page to {file_path}')
    storage.save(page_html, file_path)

    if page_sources is None:
        return file_path

    logging.info('Start sources (images, links, scripts) load')
    storage.create(user_path, f'{page_name}_files')
    download_sources(user_path + os.sep, page_sources)
    logging.info('Sources loading complete')
    return file_path
def run(url, path):
    """Download the page at ``url`` and its local resources into ``path``.

    The HTML file name and the resources folder name are both obtained from
    ``storage``. The folder is created first, then the page is requested,
    parsed with BeautifulSoup, passed to ``storage.find_local_resources``
    and saved (prettified). The resources that call returned are saved last.
    """
    logger.info(f'Start loading {url}')

    page_file = storage.get_html_file_name(url)
    resources_folder = storage.get_folder_name(page_file)
    page_path = os.path.join(path, page_file)
    resources_dir = os.path.join(path, resources_folder)

    # The folder for local resources is prepared before any network access.
    storage.create_folder(resources_dir)

    # Fetch the page and parse it.
    page_text = make_request(url, mandatory=True).text
    soup = BeautifulSoup(page_text, 'html.parser')

    # Collect local resources; per the original comment this step also
    # changes the links in the parsed document.
    resources = storage.find_local_resources(soup, url, resources_dir)

    # Persist the page itself.
    storage.save(page_path, soup.prettify())
    logger.info(f'HTML file downloaded to: {page_path}')

    # Persist the collected resources.
    storage.save_local_resource(resources)
    logger.info('Download complite!')
def test_save():
    """``save`` with ``write_mod='wb'`` writes the given bytes unchanged."""
    with open('./tests/fixtures/image/bs.jpg', 'rb') as fixture:
        expected_file = fixture.read()

    with tempfile.TemporaryDirectory() as tmpdir:
        target = os.path.join(str(tmpdir), 'new.jpg')
        save(target, expected_file, write_mod='wb')

        # Read back while the temporary directory still exists.
        with open(target, 'rb') as written:
            assert written.read() == expected_file
def download_sources(main_path, paths_and_links):
    """Download every source and save it under ``main_path``.

    Args:
        main_path: Prefix that is concatenated (plain ``+``) with each
            relative path, so it has to end with a path separator.
        paths_and_links: Sized iterable of ``(path, link)`` pairs: where to
            save the source and where to fetch it from.

    A source that cannot be fetched, either because of a non-OK status code
    or because the request itself failed, is logged as a warning and
    skipped, so one broken link does not abort the remaining downloads.
    """
    # The context manager finishes the bar even if saving raises.
    with Bar(
        'Loading', suffix='%(percent)d%%', max=len(paths_and_links),
    ) as bar:
        for path, link in paths_and_links:
            path_for_save = main_path + path
            try:
                source = requests.get(link)
            except requests.RequestException as error:
                # Network-level failure: treat it like a bad status code.
                logging.warning(f'Source {link} not load. Error: {error}')
            else:
                if source.ok:
                    logging.info(f'Source {link} was load')
                    storage.save(source.content, path_for_save)
                    logging.info(f'It was save to {path_for_save}')
                else:
                    logging.warning(
                        f'Source {link} not load. '
                        f'Code: {source.status_code}',
                    )
            bar.next()
def download_resources(url_path: str, directory: str,
                       resources: List[Tuple[str, str]]) -> None:
    """Download page resources into ``directory``, showing a progress bar.

    Args:
        url_path: Address of the page; each resource URL is resolved
            against it with ``urljoin``.
        directory: Directory the resource paths are joined to.
        resources: ``(resource_url, resource_path)`` pairs.

    A resource whose request raises ``requests.HTTPError`` is logged as a
    warning and skipped. The progress bar advances for every resource,
    including skipped ones, so it always reaches its maximum.
    """
    with Bar('Processing', max=len(resources)) as bar:
        for resource_url, resource_path in resources:
            logging.info('Start load resource `%s`', resource_url)
            target_path = os.path.join(directory, resource_path)
            try:
                response = make_request(urljoin(url_path, resource_url))
                storage.save(target_path, response.content)
            except requests.HTTPError as e:
                logging.warning(str(e))
                loaded = False
            else:
                loaded = True

            # Advance for failures too; otherwise the bar stalls short of
            # 100% whenever a resource is skipped.
            bar.next()
            if loaded:
                logging.info(
                    'Resource loaded `%s` -> `%s`',
                    resource_url, target_path,
                )
def download_resources(resources: dict):
    """Download every resource and save it, showing a progress bar.

    Args:
        resources: Mapping of resource URL to the path it is saved to.

    Downloading is best-effort: a resource that raises
    ``errors.DownloadingError`` or ``errors.SavingError`` is skipped, and
    the failure is logged as a warning instead of being dropped silently.
    The bar advances for every resource and is always finished.
    """
    bar = PixelBar("\U0001F4E5 Downloading resources", max=len(resources))
    try:
        for resource_url, resource_path in resources.items():
            try:
                path = os.path.abspath(resource_path)
                content = load(resource_url)
                log.debug(f"{resource_url} loaded")
                storage.save(content, path)
                log.debug(f"'{resource_path}' saved")
            except (errors.DownloadingError, errors.SavingError) as error:
                # Keep going, but leave a trace of what was skipped.
                log.warning(f"Resource {resource_url} skipped: {error}")
            finally:
                bar.next()
    finally:
        # Finish the bar even when an unexpected exception escapes.
        bar.finish()
def download(url, dir_for_save='.'):
    """Download the page at ``url`` and its assets into ``dir_for_save``.

    The page is fetched with ``get``, processed by ``prepare`` and saved to
    the path returned by ``parse``. Assets go to a ``<file stem>_files``
    directory via ``download_assets``.

    Returns the path of the saved page.

    Request errors and ``OSError`` are logged as warnings and re-raised.
    """
    name_file, _parsed_url, path_to_file = parse(url, dir_for_save)
    stem, _extension = splitext(name_file)
    name_dir = stem + '_files'

    try:
        page = get(url)
        output_text_html, assets = prepare(page, url, name_dir)
        assets_listing = '\n'.join(assets)
        logging.info('List of assets:\n{a}'.format(a=assets_listing))
        storage.save(path_to_file, output_text_html)
        download_assets(assets, name_dir, dir_for_save)
    except exceptions.RequestException as error:
        logging.warning('Error request: {a}'.format(a=error))
        raise
    except OSError as error:
        logging.warning('Error writing of file: {a}'.format(a=error))
        raise

    return path_to_file
def download(url_path: str, directory: str) -> str:
    """Download the page at ``url_path`` into ``directory``.

    ``directory`` is made absolute and checked with ``assert_directory``
    before the page is requested. The response content is processed by
    ``html.process`` and saved under a file name derived from the URL with
    the ``html`` extension forced. Any resources it reports are fetched with
    ``download_resources``.

    Returns the absolute path of the saved HTML file.
    """
    logging.info('Start load web page `%s` to `%s`', url_path, directory)

    target_dir = os.path.abspath(directory)
    assert_directory(target_dir)

    page = make_request(url_path)
    html_content, resources = html.process(url_path, page.content)

    abs_file_path = os.path.join(
        target_dir,
        url.to_file_name(url_path, force_extension='html'),
    )
    storage.save(abs_file_path, html_content)

    if resources:
        download_resources(url_path, target_dir, resources)

    logging.info('Web page loaded `%s` -> `%s`', url_path, abs_file_path)
    return abs_file_path
def download_assets(assets, name_dir, dir_for_save):
    """Download page assets into ``dir_for_save``, one progress bar each.

    Args:
        assets: Mapping of asset path (relative to ``dir_for_save``) to the
            asset URL.
        name_dir: Name of the assets directory, created inside
            ``dir_for_save`` when there is at least one asset.
        dir_for_save: Directory the page is being saved to.

    Raises:
        OSError: If the assets directory cannot be created (logged first).

    Failures on individual assets (request errors or ``OSError`` while
    saving) are logged as warnings and do not stop the remaining downloads.
    """
    if assets:
        try:
            make_dir(join(dir_for_save, name_dir))
        except OSError as error:
            logging.warning('Error writing of file: {a}'.format(a=error))
            raise

    for path_asset, url_asset in assets.items():
        # Each asset gets its own bar that is advanced exactly once, so its
        # maximum must be 1. Deriving it from the number of assets left the
        # bar short of 100% for pages with more than 100 assets.
        with Bar(
            'Loading {asset}: '.format(asset=unquote(url_asset)),
            max=1,
            suffix='%(percent)d%%',
        ) as bar_asset:
            try:
                content_asset = get(url_asset)
                storage.save(join(dir_for_save, path_asset), content_asset)
                bar_asset.next()
            except exceptions.RequestException as error:
                logging.warning('Error request: {a}'.format(a=error))
            except OSError as error:
                logging.warning('Error writing of file: {a}'.format(a=error))
def test_save_exceptions():
    """``save`` lets ``PermissionError`` propagate for an unwritable path.

    NOTE(review): assumes the test process may not write to the filesystem
    root (i.e. it is not run as root) - confirm for the CI environment.
    """
    with pytest.raises(PermissionError) as error:
        save('/new.html', HELLO)

    # Checked after the ``with`` block (code after the raising call inside
    # it would never run) and against the exception itself, not the
    # ``ExceptionInfo`` wrapper.
    assert 'Permission denied' in str(error.value)