Example #1
    def _fetch_url(self, url):
        """Download the given url and add the file to the collection.

        Args:
            url: The file to download as QUrl.
        """
        if url.scheme() not in ['http', 'https']:
            return
        # Prevent loading an asset twice
        if url in self.loaded_urls:
            return
        self.loaded_urls.add(url)

        log.downloads.debug("loading asset at {}".format(url))

        # Using the download manager to download host-blocked urls might crash
        # qute, see the comments/discussion on
        # https://github.com/qutebrowser/qutebrowser/pull/962#discussion_r40256987
        # and https://github.com/qutebrowser/qutebrowser/issues/1053
        host_blocker = objreg.get('host-blocker')
        if host_blocker.is_blocked(url):
            log.downloads.debug("Skipping {}, host-blocked".format(url))
            # We still need an empty file in the output, QWebView can be pretty
            # picky about displaying a file correctly when not all assets are
            # at least referenced in the mhtml file.
            self.writer.add_file(urlutils.encoded_url(url), b'')
            return

        download_manager = objreg.get('qtnetwork-download-manager')
        target = downloads.FileObjDownloadTarget(_NoCloseBytesIO())
        item = download_manager.get(url, target=target, auto_remove=True)
        self.pending_downloads.add((url, item))
        item.finished.connect(functools.partial(self._finished, url, item))
        item.error.connect(functools.partial(self._error, url, item))
        item.cancelled.connect(functools.partial(self._cancelled, url, item))
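The _NoCloseBytesIO download target used in several examples is not part of these listings. A minimal sketch of what such a helper might look like, assuming it only needs to keep the buffer readable until the MHTML writer has consumed it (an assumption, not the project's actual implementation):

import io

class _NoCloseBytesIO(io.BytesIO):
    """Sketch (assumption): a BytesIO whose close() is a no-op.

    The download item closes its file object when it finishes, but the
    buffer must stay readable until the asset has been written to the
    MHTML file; actual_close() frees it afterwards.
    """

    def close(self):
        # Ignore the close() issued by the download machinery.
        pass

    def actual_close(self):
        # Really close the underlying buffer.
        super().close()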
Example #2
    def _fetch_url(self, url):
        """Download the given url and add the file to the collection.

        Args:
            url: The file to download as QUrl.
        """
        if url.scheme() not in {"http", "https"}:
            return
        # Prevent loading an asset twice
        if url in self.loaded_urls:
            return
        self.loaded_urls.add(url)

        log.downloads.debug("loading asset at {}".format(url))

        # Using the download manager to download host-blocked urls might crash
        # qute, see the comments/discussion on
        # https://github.com/The-Compiler/qutebrowser/pull/962#discussion_r40256987
        # and https://github.com/The-Compiler/qutebrowser/issues/1053
        host_blocker = objreg.get("host-blocker")
        if host_blocker.is_blocked(url):
            log.downloads.debug("Skipping {}, host-blocked".format(url))
            # We still need an empty file in the output, QWebView can be pretty
            # picky about displaying a file correctly when not all assets are
            # at least referenced in the mhtml file.
            self.writer.add_file(urlutils.encoded_url(url), b"")
            return

        download_manager = objreg.get("download-manager", scope="window",
                                      window=self._win_id)
        item = download_manager.get(url, fileobj=_NoCloseBytesIO(),
                                    auto_remove=True)
        self.pending_downloads.add((url, item))
        item.finished.connect(functools.partial(self._finished, url, item))
        item.error.connect(functools.partial(self._error, url, item))
        item.cancelled.connect(functools.partial(self._error, url, item))
Example #3
    def _finished(self, url, item):
        """Callback when a single asset is downloaded.

        Args:
            url: The original url of the asset as QUrl.
            item: The DownloadItem given by the DownloadManager
        """
        if self.writer is None:
            raise AssertionError

        self.pending_downloads.remove((url, item))
        mime = item.raw_headers.get(b'Content-Type', b'')

        # Note that this decoding always works and doesn't produce errors
        # RFC 7230 (https://tools.ietf.org/html/rfc7230) states:
        # Historically, HTTP has allowed field content with text in the
        # ISO-8859-1 charset [ISO-8859-1], supporting other charsets only
        # through use of [RFC2047] encoding.  In practice, most HTTP header
        # field values use only a subset of the US-ASCII charset [USASCII].
        # Newly defined header fields SHOULD limit their field values to
        # US-ASCII octets.  A recipient SHOULD treat other octets in field
        # content (obs-text) as opaque data.
        mime = mime.decode('iso-8859-1')

        if mime.lower() == 'text/css' or url.fileName().endswith('.css'):
            # We can't always assume that CSS files are UTF-8, but CSS files
            # shouldn't contain many non-ASCII characters anyway (in most
            # cases). Using "ignore" lets us decode the file even if it's
            # invalid UTF-8 data.
            # The file written to the MHTML file won't be modified by this
            # decoding, since there we're taking the original bytestream.
            try:
                css_string = item.fileobj.getvalue().decode('utf-8')
            except UnicodeDecodeError:
                log.downloads.warning("Invalid UTF-8 data in {}".format(url))
                css_string = item.fileobj.getvalue().decode('utf-8', 'ignore')
            import_urls = _get_css_imports(css_string)
            for import_url in import_urls:
                absolute_url = url.resolved(QUrl(import_url))
                self._fetch_url(absolute_url)

        encode = E_QUOPRI if mime.startswith('text/') else E_BASE64
        # Our MHTML handler refuses non-ASCII headers. This will replace every
        # non-ASCII char with '?'. This is probably okay, as official Content-
        # Type headers contain ASCII only anyway. Anything else is madness.
        mime = utils.force_encoding(mime, 'ascii')
        self.writer.add_file(urlutils.encoded_url(url),
                             item.fileobj.getvalue(), mime, encode)
        item.fileobj.actual_close()
        if self.pending_downloads:
            return
        self._finish_file()
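The E_QUOPRI / E_BASE64 constants passed to writer.add_file() are not defined in these listings. The decision they encode can be illustrated with the standard quopri and base64 modules; encode_part below is a hypothetical helper for illustration only:

import base64
import quopri

def encode_part(payload: bytes, mime: str) -> bytes:
    # Textual parts stay human-readable with quoted-printable encoding,
    # everything else (images, fonts, ...) is base64-encoded, mirroring
    # the E_QUOPRI / E_BASE64 choice in _finished() above.
    if mime.startswith('text/'):
        return quopri.encodestring(payload)
    return base64.b64encode(payload)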
Example #4
    def _finished(self, url, item):
        """Callback when a single asset is downloaded.

        Args:
            url: The original url of the asset as QUrl.
            item: The DownloadItem given by the DownloadManager
        """
        self.pending_downloads.remove((url, item))
        mime = item.raw_headers.get(b'Content-Type', b'')

        # Note that this decoding always works and doesn't produce errors
        # RFC 7230 (https://tools.ietf.org/html/rfc7230) states:
        # Historically, HTTP has allowed field content with text in the
        # ISO-8859-1 charset [ISO-8859-1], supporting other charsets only
        # through use of [RFC2047] encoding.  In practice, most HTTP header
        # field values use only a subset of the US-ASCII charset [USASCII].
        # Newly defined header fields SHOULD limit their field values to
        # US-ASCII octets.  A recipient SHOULD treat other octets in field
        # content (obs-text) as opaque data.
        mime = mime.decode('iso-8859-1')

        if mime.lower() == 'text/css' or url.fileName().endswith('.css'):
            # We can't always assume that CSS files are UTF-8, but CSS files
            # shouldn't contain many non-ASCII characters anyway (in most
            # cases). Using "ignore" lets us decode the file even if it's
            # invalid UTF-8 data.
            # The file written to the MHTML file won't be modified by this
            # decoding, since there we're taking the original bytestream.
            try:
                css_string = item.fileobj.getvalue().decode('utf-8')
            except UnicodeDecodeError:
                log.downloads.warning("Invalid UTF-8 data in {}".format(url))
                css_string = item.fileobj.getvalue().decode('utf-8', 'ignore')
            import_urls = _get_css_imports(css_string)
            for import_url in import_urls:
                absolute_url = url.resolved(QUrl(import_url))
                self._fetch_url(absolute_url)

        encode = E_QUOPRI if mime.startswith('text/') else E_BASE64
        # Our MHTML handler refuses non-ASCII headers. This will replace every
        # non-ASCII char with '?'. This is probably okay, as official Content-
        # Type headers contain ASCII only anyway. Anything else is madness.
        mime = utils.force_encoding(mime, 'ascii')
        self.writer.add_file(urlutils.encoded_url(url),
                             item.fileobj.getvalue(), mime, encode)
        item.fileobj.actual_close()
        if self.pending_downloads:
            return
        self._finish_file()
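_get_css_imports() is referenced throughout but not listed on this page. A hedged sketch of how such a scanner could work, using a regular expression over @import rules and url(...) references (an illustration, not the project's actual code):

import re

_CSS_URL_RE = re.compile(
    r'''@import\s+['"](?P<import>[^'"]+)['"]'''
    r'''|url\(\s*['"]?(?P<url>[^'")]+)['"]?\s*\)''')

def _get_css_imports(data, inline=False):
    """Return URL strings referenced by the given CSS text."""
    urls = []
    for match in _CSS_URL_RE.finditer(data):
        if inline and match.group('import'):
            # style="..." attributes cannot contain @import rules.
            continue
        url = match.group('import') or match.group('url')
        if url:
            urls.append(url)
    return urls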
Example #5
    def _error(self, url, item, *_args):
        """Callback when a download error occurred.

        Args:
            url: The original url of the asset as QUrl.
            item: The DownloadItem given by the DownloadManager.
        """
        try:
            self.pending_downloads.remove((url, item))
        except KeyError:
            # This might happen if .collect_zombies() calls .finished() and the
            # error handler will be called after .collect_zombies
            log.downloads.debug("Oops! Download already gone: {}".format(item))
            return
        item.fileobj.actual_close()
        # Add a stub file, see comment in .fetch_url() for more information
        self.writer.add_file(urlutils.encoded_url(url), b'')
        if self.pending_downloads:
            return
        self._finish_file()
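The .collect_zombies() mentioned in the comment deals with downloads that complete before their finished signal is connected. A hedged sketch of such a method, assuming the download items expose done and successful flags:

    def _collect_zombies(self):
        # Sketch (assumption): pick up downloads that already completed
        # before their signals were connected in _fetch_url() and feed
        # them through the normal _finished() path.
        items = {(url, item) for url, item in self.pending_downloads
                 if item.done and item.successful}
        log.downloads.debug("Zombie downloads: {}".format(items))
        for url, item in items:
            self._finished(url, item)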
Example #6
    def _error(self, url, item, *_args):
        """Callback when a download error occurred.

        Args:
            url: The original url of the asset as QUrl.
            item: The DownloadItem given by the DownloadManager.
        """
        try:
            self.pending_downloads.remove((url, item))
        except KeyError:
            # This might happen if .collect_zombies() calls .finished() and the
            # error handler will be called after .collect_zombies
            log.downloads.debug("Oops! Download already gone: {}".format(item))
            return
        item.fileobj.actual_close()
        # Add a stub file, see comment in .fetch_url() for more information
        self.writer.add_file(urlutils.encoded_url(url), b'')
        if self.pending_downloads:
            return
        self._finish_file()
Example #7
    def _fetch_url(self, url):
        """Download the given url and add the file to the collection.

        Args:
            url: The file to download as QUrl.
        """
        if url.scheme() not in ['http', 'https']:
            return
        # Prevent loading an asset twice
        if url in self.loaded_urls:
            return
        self.loaded_urls.add(url)

        log.downloads.debug("loading asset at {}".format(url))

        # Using the download manager to download host-blocked urls might crash
        # qute, see the comments/discussion on
        # https://github.com/qutebrowser/qutebrowser/pull/962#discussion_r40256987
        # and https://github.com/qutebrowser/qutebrowser/issues/1053
        request = interceptors.Request(first_party_url=None, request_url=url)
        interceptors.run(request)
        if request.is_blocked:
            log.downloads.debug("Skipping {}, host-blocked".format(url))
            # We still need an empty file in the output, QWebView can be pretty
            # picky about displaying a file correctly when not all assets are
            # at least referenced in the mhtml file.
            self.writer.add_file(urlutils.encoded_url(url), b'')
            return

        download_manager = objreg.get('qtnetwork-download-manager')
        target = downloads.FileObjDownloadTarget(_NoCloseBytesIO())
        item = download_manager.get(url, target=target,
                                    auto_remove=True)
        self.pending_downloads.add((url, item))
        item.finished.connect(functools.partial(self._finished, url, item))
        item.error.connect(functools.partial(self._error, url, item))
        item.cancelled.connect(functools.partial(self._cancelled, url, item))
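Example #7 replaces the host-blocker lookup with the interceptors module. For context, a hedged illustration of how a request interceptor could mark URLs as blocked; register() and Request.block() are assumed API here, and the host name is made up:

from qutebrowser.extensions import interceptors

def _block_tracker_host(request):
    # Hypothetical interceptor: flag assets from one host as blocked so
    # that is_blocked becomes True in _fetch_url() above.
    if request.request_url.host() == 'tracker.example':
        request.block()

interceptors.register(_block_tracker_host)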
Example #8
def test_encoded_url(url, expected):
    """Test encoded_url"""
    url = QUrl(url)
    assert urlutils.encoded_url(url) == expected
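The test above expects url and expected to be injected by pytest, which is usually done with a parametrize decorator. A sketch of how it might be driven; the decorator and the sample data are assumptions, not part of the listing:

import pytest
from PyQt5.QtCore import QUrl

from qutebrowser.utils import urlutils

@pytest.mark.parametrize('url, expected', [
    # Assumed sample: spaces are expected to come back percent-encoded.
    ('http://www.example.com/path with spaces',
     'http://www.example.com/path%20with%20spaces'),
])
def test_encoded_url(url, expected):
    """Test encoded_url"""
    url = QUrl(url)
    assert urlutils.encoded_url(url) == expected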
Example #9
    def run(self):
        """Download and save the page.

        The object must not be reused, you should create a new one if
        you want to download another page.
        """
        if self._used:
            raise ValueError("Downloader already used")
        self._used = True
        web_url = self.web_view.url()
        web_frame = self.web_view.page().mainFrame()

        self.writer = MHTMLWriter(
            web_frame.toHtml().encode('utf-8'),
            content_location=urlutils.encoded_url(web_url),
            # I've found no way of getting the content type of a QWebView, but
            # since we're using .toHtml, it's probably safe to say that the
            # content-type is HTML
            content_type='text/html; charset="UTF-8"',
        )
        # Currently only downloading <link> (stylesheets), <script>
        # (javascript) and <img> (image) elements.
        elements = web_frame.findAllElements('link, script, img')

        for element in elements:
            element = webelem.WebElementWrapper(element)
            # Websites are free to set whatever rel=... attribute they want.
            # We just care about stylesheets and icons.
            if not _check_rel(element):
                continue
            if 'src' in element:
                element_url = element['src']
            elif 'href' in element:
                element_url = element['href']
            else:
                # Might be a local <script> tag or something else
                continue
            absolute_url = web_url.resolved(QUrl(element_url))
            self._fetch_url(absolute_url)

        styles = web_frame.findAllElements('style')
        for style in styles:
            style = webelem.WebElementWrapper(style)
            # The Mozilla Developer Network says:
            # type: This attribute defines the styling language as a MIME type
            # (charset should not be specified). This attribute is optional and
            # default to text/css if it's missing.
            # https://developer.mozilla.org/en/docs/Web/HTML/Element/style
            if 'type' in style and style['type'] != 'text/css':
                continue
            for element_url in _get_css_imports(str(style)):
                self._fetch_url(web_url.resolved(QUrl(element_url)))

        # Search for references in inline styles
        for element in web_frame.findAllElements('[style]'):
            element = webelem.WebElementWrapper(element)
            style = element['style']
            for element_url in _get_css_imports(style, inline=True):
                self._fetch_url(web_url.resolved(QUrl(element_url)))

        # Shortcut if no assets need to be downloaded, otherwise the file would
        # never be saved. Also might happen if the downloads are fast enough to
        # complete before connecting their finished signal.
        self._collect_zombies()
        if not self.pending_downloads and not self._finished_file:
            self._finish_file()
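_check_rel() filters the collected <link> elements; it is referenced above but not listed. A hedged sketch of such a check, assuming only stylesheet and icon relations are worth embedding:

def _check_rel(element):
    # Sketch (assumption): elements without a rel attribute are kept;
    # otherwise at least one of the whitespace-separated rel tokens must
    # name an asset type we want to download.
    if 'rel' not in element:
        return True
    must_have = {'stylesheet', 'icon'}
    rels = [rel.lower() for rel in element['rel'].split(' ')]
    return any(rel in must_have for rel in rels)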
Example #10
    def run(self):
        """Download and save the page.

        The object must not be reused, you should create a new one if
        you want to download another page.
        """
        if self._used:
            raise ValueError("Downloader already used")
        self._used = True
        web_url = self.tab.url()

        # FIXME:qtwebengine have a proper API for this
        page = self.tab._widget.page()  # pylint: disable=protected-access
        web_frame = page.mainFrame()

        self.writer = MHTMLWriter(
            web_frame.toHtml().encode('utf-8'),
            content_location=urlutils.encoded_url(web_url),
            # I've found no way of getting the content type of a QWebView, but
            # since we're using .toHtml, it's probably safe to say that the
            # content-type is HTML
            content_type='text/html; charset="UTF-8"',
        )
        # Currently only downloading <link> (stylesheets), <script>
        # (javascript) and <img> (image) elements.
        elements = web_frame.findAllElements('link, script, img')

        for element in elements:
            element = webkitelem.WebKitElement(element, tab=self.tab)
            # Websites are free to set whatever rel=... attribute they want.
            # We just care about stylesheets and icons.
            if not _check_rel(element):
                continue
            if 'src' in element:
                element_url = element['src']
            elif 'href' in element:
                element_url = element['href']
            else:
                # Might be a local <script> tag or something else
                continue
            absolute_url = web_url.resolved(QUrl(element_url))
            self._fetch_url(absolute_url)

        styles = web_frame.findAllElements('style')
        for style in styles:
            style = webkitelem.WebKitElement(style, tab=self.tab)
            # The Mozilla Developer Network says:
            # > type: This attribute defines the styling language as a MIME
            # > type (charset should not be specified). This attribute is
            # > optional and default to text/css if it's missing.
            # https://developer.mozilla.org/en/docs/Web/HTML/Element/style
            if 'type' in style and style['type'] != 'text/css':
                continue
            for element_url in _get_css_imports(str(style)):
                self._fetch_url(web_url.resolved(QUrl(element_url)))

        # Search for references in inline styles
        for element in web_frame.findAllElements('[style]'):
            element = webkitelem.WebKitElement(element, tab=self.tab)
            style = element['style']
            for element_url in _get_css_imports(style, inline=True):
                self._fetch_url(web_url.resolved(QUrl(element_url)))

        # Shortcut if no assets need to be downloaded, otherwise the file would
        # never be saved. Also might happen if the downloads are fast enough to
        # complete before connecting their finished signal.
        self._collect_zombies()
        if not self.pending_downloads and not self._finished_file:
            self._finish_file()
Example #11
def test_encoded_url(url, expected):
    url = QUrl(url)
    assert urlutils.encoded_url(url) == expected
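For context, urlutils.encoded_url() itself does not appear on this page. Judging from the tests above, it presumably returns the fully percent-encoded form of a QUrl as an ASCII string; a minimal sketch under that assumption:

def encoded_url(url):
    """Return the fully encoded url as a string.

    Sketch (assumption): QUrl.toEncoded() yields the percent-encoded
    form as a QByteArray, which is decoded to an ASCII str for use in
    MHTML Content-Location headers.
    """
    return bytes(url.toEncoded()).decode('ascii')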