Ejemplo n.º 1
0
def download(page_url, output=DEFAULT_OUTPUT):
    """Download a page into ``output`` and return the path of the HTML file.

    The page is loaded first; then the target HTML file path and the
    resource directory path are derived from ``page_url``. The HTML is
    rewritten by ``dom.prepare_html`` and saved, and any resources it
    reports are downloaded into the resource directory.

    Args:
        page_url: URL of the page to download.
        output: Directory in which the HTML file and the resource
            directory are placed.

    Returns:
        The path of the saved HTML file (``output`` joined with the file
        name derived from ``page_url``).

    Raises:
        errors.SavingError: If the target HTML file or the resource
            directory already exists.
    """
    log.info(f"Loading page {page_url}")
    html = load(page_url)
    log.debug(f"{page_url} loaded")

    # Refuse to overwrite results of a previous download.
    html_path = os.path.join(output, url.to_file_name(page_url, ".html"))
    if os.path.exists(html_path):
        raise errors.SavingError(f"SavingError: '{html_path}' exists")

    dir_path = os.path.join(output, url.to_dir_name(page_url))
    if os.path.exists(dir_path):
        raise errors.SavingError(f"SavingError: '{dir_path}' exists")

    html_handled, resources = dom.prepare_html(html, page_url, dir_path)
    log.info(f"Saving page '{html_path}'")
    storage.save(html_handled, os.path.abspath(html_path))
    # Fixed: the original literal lacked the f-prefix and logged the
    # placeholder text "{html_path}" instead of the actual path.
    log.debug(f"HTML '{html_path}' saved")

    # The resource directory is only created when there is something to
    # put into it.
    if resources:
        storage.create_directory(dir_path)
        log.debug(f"Directory '{dir_path}' created")
        log.info(f"Downloading resources from {page_url}")
        download_resources(resources)

    return html_path
def download(url, user_path):
    """Fetch the page at ``url`` and store it as an html-file in ``user_path``.

    Args:
        url: Address of the page to download.
        user_path: Existing directory that receives the html-file and,
            when sources are reported, a ``<page_name>_files`` directory.

    Returns:
        Path to the saved local html-file.

    Raises:
        Exception: If ``user_path`` does not exist.
        Whatever ``response.raise_for_status()`` raises when the
        GET-request is not successful.
    """
    page_name = urls.to_name(url)

    # Guard clause: nothing is requested when the target folder is missing.
    if not os.path.exists(user_path):
        logging.error(f'Directory {user_path} not exist')
        raise Exception(
            f'folder "{user_path}" not found, loading {url} aborted', )
    target_file = f'{user_path}{os.sep}{page_name}.html'

    page = requests.get(url)
    if not page.ok:
        logging.error(
            f'GET-request for {url} failed. Code: {page.status_code}', )
        page.raise_for_status()

    prepared = local_html.prepare(url, page_name, page.content)
    html_content, sources = prepared

    logging.info(f'Saving page to {target_file}')
    storage.save(html_content, target_file)

    # Only ``None`` means "no sources"; any other value is processed.
    if sources is None:
        return target_file

    logging.info('Start sources (images, links, scripts) load')
    storage.create(user_path, f'{page_name}_files')
    download_sources(user_path + os.sep, sources)
    logging.info('Sources loading complete')
    return target_file
Ejemplo n.º 3
0
def run(url, path):
    """Download the page at ``url`` and its local resources into ``path``.

    The resources folder is created before the page is requested. The
    parsed HTML is handed to ``storage.find_local_resources`` together with
    the page URL and the resources folder, then saved in prettified form,
    and finally the collected resources are saved.

    Args:
        url: Address of the page to download.
        path: Directory that receives the HTML file and the resources
            folder.
    """
    logger.info(f'Start loading {url}')

    page_file = storage.get_html_file_name(url)
    resources_folder = storage.get_folder_name(page_file)
    page_path, resources_dir = (
        os.path.join(path, name) for name in (page_file, resources_folder)
    )

    # The folder for local resources is prepared up front.
    storage.create_folder(resources_dir)

    # Fetch the page (the request is marked as mandatory) and parse it.
    page_text = make_request(url, mandatory=True).text
    soup = BeautifulSoup(page_text, 'html.parser')

    # Collect local resources; the comment in the original notes that this
    # step also changes the links in the HTML.
    found_resources = storage.find_local_resources(soup, url, resources_dir)

    storage.save(page_path, soup.prettify())
    logger.info(f'HTML file downloaded to: {page_path}')

    storage.save_local_resource(found_resources)
    logger.info('Download complite!')
Ejemplo n.º 4
0
def test_save():
    """Check that ``save`` writes binary content unchanged.

    The fixture image is read, written to a temporary directory through
    ``save`` in ``'wb'`` mode, and read back for comparison.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        target = os.path.join(str(tmpdir), 'new.jpg')

        with open('./tests/fixtures/image/bs.jpg', 'rb') as fixture:
            expected = fixture.read()

        save(target, expected, write_mod='wb')

        with open(target, 'rb') as written:
            actual = written.read()

        assert actual == expected
def download_sources(main_path, paths_and_links):
    """Download every source and save it under ``main_path``.

    A failed request is logged as a warning and skipped; the remaining
    sources are still processed. The progress bar advances once per source
    whether or not it was loaded.

    Args:
        main_path: Prefix prepended (by plain string concatenation) to each
            relative path, so it is expected to end with a path separator.
        paths_and_links: Sized iterable of ``(path, link)`` pairs, where
            ``link`` is the URL to fetch and ``path`` the location to save
            it to, relative to ``main_path``.
    """
    # The context manager finishes the bar even when saving raises; the
    # original never called ``finish()``.
    with Bar(
            'Loading',
            suffix='%(percent)d%%',
            max=len(paths_and_links),
    ) as bar:
        for path, link in paths_and_links:
            path_for_save = main_path + path
            source = requests.get(link)
            if source.ok:
                logging.info(f'Source {link} was load')
                storage.save(source.content, path_for_save)
                logging.info(f'It was save to {path_for_save}')
            else:
                logging.warning(
                    f'Source {link} not load. Code: {source.status_code}', )
            bar.next()
Ejemplo n.º 6
0
def download_resources(url_path: str, directory: str,
                       resources: List[Tuple[str, str]]) -> None:
    """Download page resources into ``directory``.

    Each resource URL is resolved against ``url_path`` and the response
    content is saved to the resource path joined onto ``directory``. A
    ``requests.HTTPError`` is logged as a warning and the resource is
    skipped.

    Args:
        url_path: URL of the page; base for resolving resource URLs.
        directory: Directory the resource paths are joined onto.
        resources: ``(resource_url, resource_path)`` pairs.
    """
    with Bar('Processing', max=len(resources)) as bar:
        for resource_url, resource_path in resources:
            logging.info('Start load resource `%s`', resource_url)
            resource_path = os.path.join(directory, resource_path)
            try:
                response = make_request(urljoin(url_path, resource_url))
                storage.save(resource_path, response.content)
            except requests.HTTPError as e:
                logging.warning(str(e))
            else:
                logging.info('Resource loaded `%s` -> `%s`', resource_url,
                             resource_path)
            finally:
                # Advance for failed resources too; the original skipped
                # this on HTTPError, so the bar could never complete.
                bar.next()
Ejemplo n.º 7
0
def download_resources(resources: dict):
    """Download resources on a best-effort basis.

    A resource that fails with ``errors.DownloadingError`` or
    ``errors.SavingError`` is reported as a warning and skipped; the
    remaining resources are still processed. The progress bar advances
    once per resource regardless of the outcome.

    Args:
        resources: Mapping of resource URL to the local path it is saved
            to.
    """
    bar = PixelBar("\U0001F4E5 Downloading resources", max=len(resources))
    for resource_url, resource_path in resources.items():
        try:
            path = os.path.abspath(resource_path)

            content = load(resource_url)
            log.debug(f"{resource_url} loaded")

            storage.save(content, path)
            log.debug(f"'{resource_path}' saved")
        except (errors.DownloadingError, errors.SavingError) as error:
            # Still best-effort, but no longer silent: the original
            # ``pass`` hid which resources were missing from the result.
            log.warning(f"Resource {resource_url} skipped: {error}")
        finally:
            bar.next()
    bar.finish()
def download(url, dir_for_save='.'):
    """Download the page at ``url`` together with its assets.

    Args:
        url: Address of the page to download.
        dir_for_save: Directory passed to ``parse`` and ``download_assets``
            as the save location; defaults to the current directory.

    Returns:
        The page file path produced by ``parse``.

    Raises:
        exceptions.RequestException: Logged and re-raised.
        OSError: Logged and re-raised.
    """
    name_file, _parsed_url, path_to_file = parse(url, dir_for_save)
    stem, _extension = splitext(name_file)
    name_dir = stem + '_files'

    try:
        page = get(url)
        output_text_html, assets = prepare(page, url, name_dir)
        asset_listing = '\n'.join(assets)
        logging.info('List of assets:\n{a}'.format(a=asset_listing))
        storage.save(path_to_file, output_text_html)
        download_assets(assets, name_dir, dir_for_save)
    except exceptions.RequestException as error:
        logging.warning('Error request: {a}'.format(a=error))
        raise
    except OSError as error:
        logging.warning('Error writing of file: {a}'.format(a=error))
        raise

    # Both handlers re-raise, so this point is reached only on success.
    return path_to_file
Ejemplo n.º 9
0
def download(url_path: str, directory: str) -> str:
    """Download a web page and its resources into ``directory``.

    Args:
        url_path: URL of the page to download.
        directory: Target directory; converted to an absolute path and
            checked with ``assert_directory`` before anything is requested.

    Returns:
        Absolute path of the saved HTML file.
    """
    logging.info('Start load web page `%s` to `%s`', url_path, directory)

    target_dir = os.path.abspath(directory)
    assert_directory(target_dir)

    page = make_request(url_path)
    html_content, resources = html.process(url_path, page.content)

    abs_file_path = os.path.join(
        target_dir,
        url.to_file_name(url_path, force_extension='html'),
    )
    storage.save(abs_file_path, html_content)

    # Resource download is skipped entirely when none were reported.
    if resources:
        download_resources(url_path, target_dir, resources)

    logging.info('Web page loaded `%s` -> `%s`', url_path, abs_file_path)
    return abs_file_path
Ejemplo n.º 10
0
def download_assets(assets, name_dir, dir_for_save):
    """Download page assets into ``dir_for_save``.

    The assets directory is created first; a failure there is logged and
    re-raised. Each asset is then downloaded on a best-effort basis: a
    request or write error for a single asset is logged as a warning and
    the remaining assets are still processed.

    Args:
        assets: Mapping of the asset's path (joined onto ``dir_for_save``
            when saving) to the asset's URL.
        name_dir: Name of the assets directory inside ``dir_for_save``.
        dir_for_save: Base directory for the download.

    Raises:
        OSError: If the assets directory cannot be created.
    """
    # Nothing to create or download for an empty (or missing) mapping.
    if not assets:
        return

    try:
        make_dir(join(dir_for_save, name_dir))
    except OSError as error:
        logging.warning('Error writing of file: {a}'.format(a=error))
        raise

    for path_asset, url_asset in assets.items():
        # One bar per asset, advanced exactly once on success, so its
        # maximum is a single step. The original ``len(assets) / 100``
        # displayed less than 100% for a completed asset whenever there
        # were more than 100 assets.
        with Bar(
                'Loading {asset}: '.format(asset=unquote(url_asset)),
                max=1,
                suffix='%(percent)d%%') as bar_asset:
            try:
                content_asset = get(url_asset)
                storage.save(join(dir_for_save, path_asset), content_asset)
                bar_asset.next()
            except exceptions.RequestException as error:
                logging.warning('Error request: {a}'.format(a=error))
            except OSError as error:
                logging.warning('Error writing of file: {a}'.format(a=error))
Ejemplo n.º 11
0
def test_save_exceptions():
    """Check that ``save`` propagates ``PermissionError`` for '/new.html'.

    NOTE(review): presumably relies on the test process having no write
    access to the filesystem root (i.e. not running as root) — confirm for
    the CI environment.
    """
    with pytest.raises(PermissionError) as error:
        save('/new.html', HELLO)
    # ``error`` is pytest's ExceptionInfo wrapper; inspect the raised
    # exception itself rather than the wrapper's string form, which differs
    # between pytest versions.
    assert 'Permission denied' in str(error.value)