Beispiel #1
0
 def createComment(self, data):
     """Build a comment node for *data*.

     The text is first turned into a ``BeautifulSoup.Comment`` and that
     node is then handed, together with ``self``, to ``Comment``.
     """
     soup_comment = BeautifulSoup.Comment(data)
     wrapped = Comment(self, soup_comment)
     return wrapped
Beispiel #2
0
def convert_page(page_path: str,
                 parser: str = 'auto',
                 callback: Callable[[str, str, str], None] = lambda *_: None,
                 ignore_errors: bool = False,
                 ignore_images: bool = False,
                 ignore_css: bool = False,
                 ignore_js: bool = False) -> str:
    """Take an HTML file or URL and outputs new HTML with resources as data URIs.

    Parameters:
        page_path (str): URL or path of web page to convert. If ``None``,
            the page is read from standard input as bytes.
    Keyword Arguments:
        parser (str): HTML Parser for Beautiful Soup 4 to use. See
            `BS4's docs. <http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser>`_
            Default: 'auto' - Not an actual parser, but tells the library to
            automatically choose a parser.
        ignore_errors (bool): If ``True`` do not abort on unreadable resources.
            Unprocessable tags (e.g. broken links) will simply be skipped.
            Default: ``False``
        ignore_images (bool): If ``True`` do not process ``<img>`` tags.
            Default: ``False``
        ignore_css (bool): If ``True`` do not process ``<link>`` (stylesheet) tags.
            Default: ``False``
        ignore_js (bool): If ``True`` do not process ``<script>`` tags.
            Default: ``False``
        callback (function): Called before a new resource is processed. Takes
            three parameters: message type ('INFO' or 'ERROR'), a string with
            the category of the callback (usually the tag related to the
            message), and the message data (usually a string to be printed).
    Returns:
        str: The new webpage HTML.
    Raises:
        OSError: Error reading a file
        ValueError: Problem with a path/URL
        requests.exceptions.RequestException: Problem getting remote resource
        NameError: HTMLArk requires Requests to be installed to get resources
            from the web. This error is raised when an external URL is
            encountered.
    Examples:
        A very basic conversion of a local HTML file, using default settings:

        >>> convert_page("webpage.html")
        <Converted page HTML>

        However, that example will fail if there are any problems accessing
        linked resources in the HTML (e.g. a missing image). If you cannot
        verify the validity of links ahead of time (converting a downloaded
        web page, for example) you can disable failing on error:

        >>> convert_page("brokenpage.html", ignore_errors=True)
        <Converted page HTML, tags with broken links untouched>

        You can also skip processing of content types:

        >>> convert_page("webpage.html", ignore_images=True)
        <Converted page HTML, with <img> tags untouched>

        If you want to get feedback on the progress of the conversion, you can
        define a callback function. For example, a callback that prints all
        CSS-related errors to stdout (note that ignore_errors will bypass
        broken links but still report them to the callback):

        >>> def mycallback(message_type, message_category, message):
        ...     if message_type == 'ERROR' and message_category == 'link':
        ...         print(message)
        >>> convert_page("badcss.html", ignore_errors=True, callback=mycallback)
        <Converted page HTML, CSS links untouched, CSS errors printed to screen>
    """
    # Check features
    if requests_get is None:
        callback('INFO', 'feature',
                 "Requests not available, web downloading disabled")

    # Get page HTML, whether from a server, a local file, or stdin
    if page_path is None:
        # Encoding is unknown, read as bytes (let bs4 handle decoding)
        page_text = sys.stdin.buffer.read()
    else:
        _, page_text = _get_resource(page_path)

    # Not all parsers are equal - it can be specified on the command line
    # so the user can try another when one fails
    if parser == 'auto':
        parser = get_available_parsers()[0]
    soup = bs4.BeautifulSoup(page_text, parser)
    callback('INFO', 'parser', "Using parser " + parser)

    tags = []

    # Gather all the relevant tags together. Only tags that actually carry
    # the attribute holding the resource URL are collected; a tag without
    # it has nothing to embed and would otherwise raise KeyError below.
    if not ignore_images:
        tags += [img for img in soup('img') if 'src' in img.attrs]
    if not ignore_css:
        # A <link> may have no rel attribute at all, so use .get() with an
        # empty default instead of indexing.
        tags += [link for link in soup('link')
                 if 'stylesheet' in link.get('rel', [])
                 and 'href' in link.attrs]
    if not ignore_js:
        tags += [script for script in soup('script')
                 if 'src' in script.attrs]

    # Convert the linked resources
    for tag in tags:
        url_attr = 'href' if tag.name == 'link' else 'src'
        tag_url = tag[url_attr]
        try:
            # BUG: doesn't work if using relative remote URLs in a local file
            fullpath = urljoin(page_path, tag_url)
            tag_mime, tag_data = _get_resource(fullpath)
        except RequestException:
            callback('ERROR', tag.name, "Can't access URL " + fullpath)
            if not ignore_errors:
                raise
        except OSError as e:
            callback('ERROR', tag.name,
                     "Error reading '{}': {}".format(e.filename, e.strerror))
            if not ignore_errors:
                raise
        except ValueError as e:
            # Raised when a problem with the URL is found. The scheme is
            # expected as the second exception argument, but a ValueError
            # from elsewhere (e.g. URL parsing) may carry only a message,
            # so do not index blindly.
            scheme = e.args[1] if len(e.args) > 1 else None
            # Don't need to process things that are already data URIs
            if scheme == 'data':
                callback('INFO', tag.name, "Already data URI")
            else:
                # htmlark can only get from http/https and local files
                callback('ERROR', tag.name,
                         "Unknown protocol in URL: " + tag_url)
                if not ignore_errors:
                    raise
        except NameError as e:
            # Requests module is not available
            callback('ERROR', tag.name, str(e))
            if not ignore_errors:
                raise
        else:
            encoded_resource = make_data_uri(tag_mime, tag_data)
            tag[url_attr] = encoded_resource
            callback('INFO', tag.name, tag_url)
        # Record the original URL so the original HTML can be recovered
        tag.insert_after(bs4.Comment("URL:" + tag_url))

    footer = bs4.Comment("Generated by HTMLArk {}. Original URL {}".format(
        datetime.now(), page_path))
    if soup.html is not None:
        soup.html.insert_after(footer)
    else:
        # Some parsers do not synthesize an <html> element for fragments;
        # fall back to appending the footer at the end of the document.
        soup.append(footer)
    return str(soup)
Beispiel #3
0
 def createComment(self, data):
     """Build a comment node for *data*.

     ``Comment`` is imported locally from the sibling ``.Comment`` module.
     The text is turned into a ``bs4.Comment`` and that node is handed,
     together with ``self``, to ``Comment``.
     """
     from .Comment import Comment

     soup_comment = bs4.Comment(data)
     wrapped = Comment(self, soup_comment)
     return wrapped
Beispiel #4
0
def appendComment(bs, text=''):
    """Append a comment node holding *text* to *bs*.

    The node is a ``bs4.Comment`` built from *text* (empty by default) and
    is added through ``bs.append``. Nothing is returned.
    """
    comment_node = bs4.Comment(text)
    bs.append(comment_node)
Beispiel #5
0
 def createComment(self, data):
     """Build a comment node for *data*.

     ``Comment`` is imported locally from the sibling ``.Comment`` module.
     The text is turned into a ``BeautifulSoup.Comment`` and that node is
     handed, together with ``self``, to ``Comment``.
     """
     from .Comment import Comment

     soup_comment = BeautifulSoup.Comment(data)
     wrapped = Comment(self, soup_comment)
     return wrapped