Example #1
    def normalize(self):
        parsed = urllib_parse.urlsplit(self.url)
        if parsed.scheme == 'file':
            path = urllib_request.url2pathname(parsed.path)
            path = normalize_path(path)
            path = urllib_request.pathname2url(path)
            self.url = urllib_parse.urlunsplit(
                (parsed.scheme, parsed.netloc,
                 path, parsed.query, parsed.fragment))
Example #2
def _get_html_page(link, session=None):
    # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'"
        )

    url = link.url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith('/'):
            url += '/'
        url = urllib_parse.urljoin(url, 'index.html')
        logger.debug(' file: URL is directory, getting %s', url)

    try:
        resp = _get_html_response(url, session=session)
    except _NotHTTP as exc:
        logger.debug(
            'Skipping page %s because it looks like an archive, and cannot '
            'be checked by HEAD.', link,
        )
    except _NotHTML as exc:
        logger.debug(
            'Skipping page %s because the %s request got Content-Type: %s',
            link, exc.request_desc, exc.content_type,
        )
    except requests.HTTPError as exc:
        _handle_get_page_fail(link, exc)
    except RetryError as exc:
        _handle_get_page_fail(link, exc)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, "connection error: %s" % exc)
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out")
    else:
        return HTMLPage(resp.content, resp.url, resp.headers)
    return None
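`_match_vcs_scheme` is a pip-internal helper that this listing does not show. Based on the equivalent inline loop over `VcsSupport.schemes` in Examples #13 and #16 below, a plausible sketch (with an illustrative `schemes` tuple) is:

def _match_vcs_scheme(url, schemes=('bzr', 'git', 'hg', 'svn')):
    """Return the VCS scheme that `url` starts with, or None.

    As in the inline checks elsewhere in this listing, the scheme prefix
    must be followed by '+' or ':' to count as a match.
    """
    lowered = url.lower()
    for scheme in schemes:
        if lowered.startswith(scheme) and lowered[len(scheme)] in '+:':
            return scheme
    return None

print(_match_vcs_scheme('git+https://example.com/repo.git'))  # 'git'
print(_match_vcs_scheme('https://example.com/simple/'))       # None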
Example #3
def url_to_path(url):
    """
    Convert a file: URL to a path.
    """
    assert url.startswith("file:"), "You can only turn file: urls into filenames (not %r)" % url

    _, netloc, path, _, _ = urllib_parse.urlsplit(url)

    # if we have a UNC path, prepend UNC share notation
    if netloc:
        netloc = "\\\\" + netloc

    path = urllib_request.url2pathname(netloc + path)
    return path
Example #4
def url_to_path(url):
    # type: (str) -> str
    """
    Convert a file: URL to a path.
    """
    assert url.startswith('file:'), (
        "You can only turn file: urls into filenames (not %r)" % url)

    _, netloc, path, _, _ = urllib_parse.urlsplit(url)

    # if we have a UNC path, prepend UNC share notation
    if netloc:
        netloc = '\\\\' + netloc

    path = urllib_request.url2pathname(netloc + path)
    return path
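For reference, the conversion can be exercised directly against the standard library (a sketch assuming the `urllib_parse`/`urllib_request` aliases are `urllib.parse`/`urllib.request`):

from urllib import parse as urllib_parse
from urllib import request as urllib_request

url = 'file:///home/user/pkg-1.0.tar.gz'
_, netloc, path, _, _ = urllib_parse.urlsplit(url)
prefix = '\\\\' + netloc if netloc else ''  # UNC share notation, e.g. file://server/share
print(urllib_request.url2pathname(prefix + path))
# /home/user/pkg-1.0.tar.gz on POSIX (backslash-separated on Windows)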
Example #5
    def __init__(self, url=None, *args, **kwargs):

        # Works around an apparent Git bug
        # (see https://article.gmane.org/gmane.comp.version-control.git/146500)
        if url:
            scheme, netloc, path, query, fragment = urlsplit(url)
            if scheme.endswith('file'):
                initial_slashes = path[:-len(path.lstrip('/'))]
                newpath = (initial_slashes +
                           urllib_request.url2pathname(path).replace(
                               '\\', '/').lstrip('/'))
                url = urlunsplit((scheme, netloc, newpath, query, fragment))
                after_plus = scheme.find('+') + 1
                url = scheme[:after_plus] + urlunsplit(
                    (scheme[after_plus:], netloc, newpath, query, fragment), )

        super(Git, self).__init__(url, *args, **kwargs)
Example #6
    def get_url_rev_and_auth(cls, url):
        # type: (str) -> Tuple[str, Optional[str], AuthInfo]
        """
        Prefixes stub URLs like 'user@hostname:user/repo.git' with 'ssh://'.
        That's required because although they use SSH they sometimes don't
        work with a ssh:// scheme (e.g. GitHub). But we need a scheme for
        parsing. Hence we remove it again afterwards and return it as a stub.
        """
        # Works around an apparent Git bug
        # (see https://article.gmane.org/gmane.comp.version-control.git/146500)
        scheme, netloc, path, query, fragment = urlsplit(url)
        if scheme.endswith('file'):
            initial_slashes = path[:-len(path.lstrip('/'))]
            newpath = (
                initial_slashes +
                urllib_request.url2pathname(path)
                .replace('\\', '/').lstrip('/')
            )
Example #7
    def __init__(self, url=None, *args, **kwargs):

        # Works around an apparent Git bug
        # (see http://article.gmane.org/gmane.comp.version-control.git/146500)
        if url:
            scheme, netloc, path, query, fragment = urlsplit(url)
            if scheme.endswith('file'):
                initial_slashes = path[:-len(path.lstrip('/'))]
                newpath = (
                    initial_slashes +
                    urllib_request.url2pathname(path)
                    .replace('\\', '/').lstrip('/')
                )
                url = urlunsplit((scheme, netloc, newpath, query, fragment))
                after_plus = scheme.find('+') + 1
                url = scheme[:after_plus] + urlunsplit(
                    (scheme[after_plus:], netloc, newpath, query, fragment),
                )

        super(Git, self).__init__(url, *args, **kwargs)
Example #8
def _clean_link(url):
    # type: (str) -> str
    """Makes sure a link is fully encoded.  That is, if a ' ' shows up in
    the link, it will be rewritten to %20 (while not over-quoting
    % or other characters)."""
    # Split the URL into parts according to the general structure
    # `scheme://netloc/path;parameters?query#fragment`. Note that the
    # `netloc` can be empty and the URI will then refer to a local
    # filesystem path.
    result = urllib_parse.urlparse(url)
    # In both cases below we unquote prior to quoting to make sure
    # nothing is double quoted.
    if result.netloc == "":
        # On Windows the path part might contain a drive letter which
        # should not be quoted. On Linux where drive letters do not
        # exist, the colon should be quoted. We rely on urllib.request
        # to do the right thing here.
        path = urllib_request.pathname2url(
            urllib_request.url2pathname(result.path))
    else:
        path = urllib_parse.quote(urllib_parse.unquote(result.path))
    return urllib_parse.urlunparse(result._replace(path=path))
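In practice the quoting rule amounts to: unquote first, then re-quote, so spaces become %20 while existing %XX escapes are not double-encoded. A small standalone check under the same standard-library assumption:

from urllib import parse, request

def clean_link(url):
    """Re-quote only what needs quoting; never double-encode %XX sequences."""
    result = parse.urlparse(url)
    if result.netloc == "":
        # Local filesystem path: let urllib.request decide what to quote.
        path = request.pathname2url(request.url2pathname(result.path))
    else:
        path = parse.quote(parse.unquote(result.path))
    return parse.urlunparse(result._replace(path=path))

print(clean_link('https://example.com/my package/file%20name.whl'))
# https://example.com/my%20package/file%20name.whl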
Example #9
def _clean_link(url):
    # type: (str) -> str
    """Makes sure a link is fully encoded.  That is, if a ' ' shows up in
    the link, it will be rewritten to %20 (while not over-quoting
    % or other characters)."""
    # Split the URL into parts according to the general structure
    # `scheme://netloc/path;parameters?query#fragment`. Note that the
    # `netloc` can be empty and the URI will then refer to a local
    # filesystem path.
    result = urllib_parse.urlparse(url)
    # In both cases below we unquote prior to quoting to make sure
    # nothing is double quoted.
    if result.netloc == "":
        # On Windows the path part might contain a drive letter which
        # should not be quoted. On Linux where drive letters do not
        # exist, the colon should be quoted. We rely on urllib.request
        # to do the right thing here.
        path = urllib_request.pathname2url(
            urllib_request.url2pathname(result.path))
    else:
        path = urllib_parse.quote(urllib_parse.unquote(result.path))
    return urllib_parse.urlunparse(result._replace(path=path))
Example #10
def url_to_path(url):
    # type: (str) -> str
    """
    Convert a file: URL to a path.
    """
    assert url.startswith("file:"), (
        "You can only turn file: urls into filenames (not %r)" % url)

    _, netloc, path, _, _ = urllib_parse.urlsplit(url)

    if not netloc or netloc == "localhost":
        # According to RFC 8089, same as empty authority.
        netloc = ""
    elif sys.platform == "win32":
        # If we have a UNC path, prepend UNC share notation.
        netloc = "\\\\" + netloc
    else:
        raise ValueError(
            "non-local file URIs are not supported on this platform: %r" % url)

    path = urllib_request.url2pathname(netloc + path)
    return path
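A brief illustration of the three branches, using the `url_to_path` defined just above and assuming the aliases resolve to the standard library:

import sys
from urllib import parse as urllib_parse
from urllib import request as urllib_request

print(url_to_path('file:///srv/pkgs/foo.whl'))           # /srv/pkgs/foo.whl
print(url_to_path('file://localhost/srv/pkgs/foo.whl'))  # same path: RFC 8089 empty authority
# url_to_path('file://fileserver/srv/pkgs/foo.whl') raises ValueError on non-Windows
# platforms; on win32 it should instead yield the UNC path \\fileserver\srv\pkgs\foo.whl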
Example #11
def url_to_path(url):
    # type: (str) -> str
    """
    Convert a file: URL to a path.
    """
    assert url.startswith('file:'), (
        "You can only turn file: urls into filenames (not {url!r})".format(
            **locals()))

    _, netloc, path, _, _ = urllib_parse.urlsplit(url)

    if not netloc or netloc == 'localhost':
        # According to RFC 8089, same as empty authority.
        netloc = ''
    elif sys.platform == 'win32':
        # If we have a UNC path, prepend UNC share notation.
        netloc = '\\\\' + netloc
    else:
        raise ValueError(
            'non-local file URIs are not supported on this platform: {url!r}'.
            format(**locals()))

    path = urllib_request.url2pathname(netloc + path)
    return path
Example #12
def url_to_path(url):
    # type: (str) -> str
    """
    Convert a file: URL to a path.
    """
    assert url.startswith('file:'), (
        "You can only turn file: urls into filenames (not %r)" % url)

    _, netloc, path, _, _ = urllib_parse.urlsplit(url)

    if not netloc or netloc == 'localhost':
        # According to RFC 8089, same as empty authority.
        netloc = ''
    elif sys.platform == 'win32':
        # If we have a UNC path, prepend UNC share notation.
        netloc = '\\\\' + netloc
    else:
        raise ValueError(
            'non-local file URIs are not supported on this platform: %r'
            % url
        )

    path = urllib_request.url2pathname(netloc + path)
    return path
Example #13
    def get_page(cls, link, skip_archives=True, session=None):
        if session is None:
            raise TypeError(
                "get_page() missing 1 required keyword argument: 'session'")

        url = link.url
        url = url.split('#', 1)[0]

        # Check for VCS schemes that do not support lookup as web pages.
        from pip.vcs import VcsSupport
        for scheme in VcsSupport.schemes:
            if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
                logger.debug('Cannot look at %s URL %s', scheme, link)
                return None

        try:
            if skip_archives:
                filename = link.filename
                for bad_ext in ARCHIVE_EXTENSIONS:
                    if filename.endswith(bad_ext):
                        content_type = cls._get_content_type(
                            url,
                            session=session,
                        )
                        if content_type.lower().startswith('text/html'):
                            break
                        else:
                            logger.debug(
                                'Skipping page %s because of Content-Type: %s',
                                link,
                                content_type,
                            )
                            return

            logger.debug('Getting page %s', url)

            # Tack index.html onto file:// URLs that point to directories
            (scheme, netloc, path, params, query, fragment) = \
                urllib_parse.urlparse(url)
            if (scheme == 'file'
                    and os.path.isdir(urllib_request.url2pathname(path))):
                # add trailing slash if not present so urljoin doesn't trim
                # final segment
                if not url.endswith('/'):
                    url += '/'
                url = urllib_parse.urljoin(url, 'index.html')
                logger.debug(' file: URL is directory, getting %s', url)

            resp = session.get(
                url,
                headers={
                    "Accept": "text/html",
                    "Cache-Control": "max-age=600",
                },
            )
            resp.raise_for_status()

            # The check for archives above only works if the url ends with
            # something that looks like an archive. However that is not a
            # requirement of an url. Unless we issue a HEAD request on every
            # url we cannot know ahead of time for sure if something is HTML
            # or not. However we can check after we've downloaded it.
            content_type = resp.headers.get('Content-Type', 'unknown')
            if not content_type.lower().startswith("text/html"):
                logger.debug(
                    'Skipping page %s because of Content-Type: %s',
                    link,
                    content_type,
                )
                return

            inst = cls(resp.content, resp.url, resp.headers)
        except requests.HTTPError as exc:
            cls._handle_fail(link, exc, url)
        except SSLError as exc:
            reason = ("There was a problem confirming the ssl certificate: "
                      "%s" % exc)
            cls._handle_fail(link, reason, url, meth=logger.info)
        except requests.ConnectionError as exc:
            cls._handle_fail(link, "connection error: %s" % exc, url)
        except requests.Timeout:
            cls._handle_fail(link, "timed out", url)
        else:
            return inst
Example #14
    url = link.url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.warning('Cannot look at %s URL %s because it does not support '
                       'lookup as web pages.', vcs_scheme, link)
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith('/'):
            url += '/'
        url = urllib_parse.urljoin(url, 'index.html')
        logger.debug(' file: URL is directory, getting %s', url)

    try:
        resp = _get_html_response(url, session=session)
    except _NotHTTP:
        logger.warning(
            'Skipping page %s because it looks like an archive, and cannot '
            'be checked by a HTTP HEAD request.', link,
        )
Example #15
def _get_html_page(link, session=None):
    # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'")

    url = link.url.split("#", 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.warning(
            "Cannot look at %s URL %s because it does not support "
            "lookup as web pages.",
            vcs_scheme,
            link,
        )
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
    if scheme == "file" and os.path.isdir(urllib_request.url2pathname(path)):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith("/"):
            url += "/"
        url = urllib_parse.urljoin(url, "index.html")
        logger.debug(" file: URL is directory, getting %s", url)

    try:
        resp = _get_html_response(url, session=session)
    except _NotHTTP:
        logger.warning(
            "Skipping page %s because it looks like an archive, and cannot "
            "be checked by a HTTP HEAD request.",
            link,
        )
    except _NotHTML as exc:
        logger.warning(
            "Skipping page %s because the %s request got Content-Type: %s."
            "The only supported Content-Type is text/html",
            link,
            exc.request_desc,
            exc.content_type,
        )
    except NetworkConnectionError as exc:
        _handle_get_page_fail(link, exc)
    except RetryError as exc:
        _handle_get_page_fail(link, exc)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, "connection error: {}".format(exc))
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out")
    else:
        return _make_html_page(resp,
                               cache_link_parsing=link.cache_link_parsing)
    return None
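`_handle_get_page_fail` itself is not reproduced in this listing. Judging from its call sites above (a link, a reason or exception, and an optional logging method), a minimal sketch could look like:

import logging

logger = logging.getLogger(__name__)

def _handle_get_page_fail(link, reason, meth=None):
    """Log why `link` is being skipped; default to debug-level logging."""
    if meth is None:
        meth = logger.debug
    meth("Could not fetch URL %s: %s - skipping", link, reason)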
Example #16
    def get_page(cls, link, skip_archives=True, session=None):
        if session is None:
            raise TypeError(
                "get_page() missing 1 required keyword argument: 'session'"
            )

        url = link.url
        url = url.split('#', 1)[0]

        # Check for VCS schemes that do not support lookup as web pages.
        from pip._internal.vcs import VcsSupport
        for scheme in VcsSupport.schemes:
            if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
                logger.debug('Cannot look at %s URL %s', scheme, link)
                return None

        try:
            if skip_archives:
                filename = link.filename
                for bad_ext in ARCHIVE_EXTENSIONS:
                    if filename.endswith(bad_ext):
                        content_type = cls._get_content_type(
                            url, session=session,
                        )
                        if content_type.lower().startswith('text/html'):
                            break
                        else:
                            logger.debug(
                                'Skipping page %s because of Content-Type: %s',
                                link,
                                content_type,
                            )
                            return

            logger.debug('Getting page %s', url)

            # Tack index.html onto file:// URLs that point to directories
            (scheme, netloc, path, params, query, fragment) = \
                urllib_parse.urlparse(url)
            if (scheme == 'file' and
                    os.path.isdir(urllib_request.url2pathname(path))):
                # add trailing slash if not present so urljoin doesn't trim
                # final segment
                if not url.endswith('/'):
                    url += '/'
                url = urllib_parse.urljoin(url, 'index.html')
                logger.debug(' file: URL is directory, getting %s', url)

            resp = session.get(
                url,
                headers={
                    "Accept": "text/html",
                    "Cache-Control": "max-age=600",
                },
            )
            resp.raise_for_status()

            # The check for archives above only works if the url ends with
            # something that looks like an archive. However that is not a
            # requirement of an url. Unless we issue a HEAD request on every
            # url we cannot know ahead of time for sure if something is HTML
            # or not. However we can check after we've downloaded it.
            content_type = resp.headers.get('Content-Type', 'unknown')
            if not content_type.lower().startswith("text/html"):
                logger.debug(
                    'Skipping page %s because of Content-Type: %s',
                    link,
                    content_type,
                )
                return

            inst = cls(resp.content, resp.url, resp.headers)
        except requests.HTTPError as exc:
            cls._handle_fail(link, exc, url)
        except SSLError as exc:
            reason = "There was a problem confirming the ssl certificate: "
            reason += str(exc)
            cls._handle_fail(link, reason, url, meth=logger.info)
        except requests.ConnectionError as exc:
            cls._handle_fail(link, "connection error: %s" % exc, url)
        except requests.Timeout:
            cls._handle_fail(link, "timed out", url)
        else:
            return inst
Example #17
def _get_html_page(link, session=None):
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'")

    url = link.url
    url = url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    from pip._internal.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug('Cannot look at %s URL %s', scheme, link)
            return None

    try:
        filename = link.filename
        for bad_ext in ARCHIVE_EXTENSIONS:
            if filename.endswith(bad_ext):
                content_type = _get_content_type(url, session=session)
                if content_type.lower().startswith('text/html'):
                    break
                else:
                    logger.debug(
                        'Skipping page %s because of Content-Type: %s',
                        link,
                        content_type,
                    )
                    return

        logger.debug('Getting page %s', url)

        # Tack blank.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = \
            urllib_parse.urlparse(url)
        if (scheme == 'file'
                and os.path.isdir(urllib_request.url2pathname(path))):
            # add trailing slash if not present so urljoin doesn't trim
            # final segment
            if not url.endswith('/'):
                url += '/'
            url = urllib_parse.urljoin(url, 'blank.html')
            logger.debug(' file: URL is directory, getting %s', url)

        resp = session.get(
            url,
            headers={
                "Accept": "text/html",
                # We don't want to blindly return cached data for
                # /simple/, because authors generally expect that
                # twine upload && pip install will function, but if
                # they've done a pip install in the last ~10 minutes
                # it won't. Thus by setting this to zero we will not
                # blindly use any cached data, however the benefit of
                # using max-age=0 instead of no-cache, is that we will
                # still support conditional requests, so we will still
                # minimize traffic sent in cases where the page hasn't
                # changed at all, we will just always incur the round
                # trip for the conditional GET now instead of only
                # once per 10 minutes.
                # For more information, please see pypa/pip#5670.
                "Cache-Control": "max-age=0",
            },
        )
        resp.raise_for_status()

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement of an url. Unless we issue a HEAD request on every
        # url we cannot know ahead of time for sure if something is HTML
        # or not. However we can check after we've downloaded it.
        content_type = resp.headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug(
                'Skipping page %s because of Content-Type: %s',
                link,
                content_type,
            )
            return

        inst = HTMLPage(resp.content, resp.url, resp.headers)
    except HTTPError as exc:
        _handle_get_page_fail(link, exc, url)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, url, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, "connection error: %s" % exc, url)
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out", url)
    else:
        return inst
Example #18
    def get_page(cls, link, skip_archives=True, session=None):
        if session is None:
            raise TypeError("get_page() missing 1 required keyword argument: 'session'")

        url = link.url
        url = url.split("#", 1)[0]

        # Check for VCS schemes that do not support lookup as web pages.
        from pip.vcs import VcsSupport

        for scheme in VcsSupport.schemes:
            if url.lower().startswith(scheme) and url[len(scheme)] in "+:":
                logger.debug("Cannot look at %s URL %s", scheme, link)
                return None

        try:
            if skip_archives:
                filename = link.filename
                for bad_ext in [".tar", ".tar.gz", ".tar.bz2", ".tgz", ".zip"]:
                    if filename.endswith(bad_ext):
                        content_type = cls._get_content_type(url, session=session)
                        if content_type.lower().startswith("text/html"):
                            break
                        else:
                            logger.debug("Skipping page %s because of Content-Type: %s", link, content_type)
                            return

            logger.debug("Getting page %s", url)

            # Tack index.html onto file:// URLs that point to directories
            (scheme, netloc, path, params, query, fragment) = urllib_parse.urlparse(url)
            if scheme == "file" and os.path.isdir(urllib_request.url2pathname(path)):
                # add trailing slash if not present so urljoin doesn't trim
                # final segment
                if not url.endswith("/"):
                    url += "/"
                url = urllib_parse.urljoin(url, "index.html")
                logger.debug(" file: URL is directory, getting %s", url)

            resp = session.get(url, headers={"Accept": "text/html", "Cache-Control": "max-age=600"})
            resp.raise_for_status()

            # The check for archives above only works if the url ends with
            #   something that looks like an archive. However that is not a
            #   requirement of an url. Unless we issue a HEAD request on every
            #   url we cannot know ahead of time for sure if something is HTML
            #   or not. However we can check after we've downloaded it.
            content_type = resp.headers.get("Content-Type", "unknown")
            if not content_type.lower().startswith("text/html"):
                logger.debug("Skipping page %s because of Content-Type: %s", link, content_type)
                return

            inst = cls(resp.content, resp.url, resp.headers, trusted=link.trusted)
        except requests.HTTPError as exc:
            level = 2 if exc.response.status_code == 404 else 1
            cls._handle_fail(link, exc, url, level=level)
        except requests.ConnectionError as exc:
            cls._handle_fail(link, "connection error: %s" % exc, url)
        except requests.Timeout:
            cls._handle_fail(link, "timed out", url)
        except SSLError as exc:
            reason = "There was a problem confirming the ssl certificate: " "%s" % exc
            cls._handle_fail(link, reason, url, level=2, meth=logger.info)
        else:
            return inst
Example #19
def _get_html_page(link, session=None):
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'"
        )

    url = link.url
    url = url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    from pip._internal.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug('Cannot look at %s URL %s', scheme, link)
            return None

    try:
        filename = link.filename
        for bad_ext in ARCHIVE_EXTENSIONS:
            if filename.endswith(bad_ext):
                content_type = _get_content_type(url, session=session)
                if content_type.lower().startswith('text/html'):
                    break
                else:
                    logger.debug(
                        'Skipping page %s because of Content-Type: %s',
                        link,
                        content_type,
                    )
                    return

        logger.debug('Getting page %s', url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = \
            urllib_parse.urlparse(url)
        if (scheme == 'file' and
                os.path.isdir(urllib_request.url2pathname(path))):
            # add trailing slash if not present so urljoin doesn't trim
            # final segment
            if not url.endswith('/'):
                url += '/'
            url = urllib_parse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s', url)

        resp = session.get(
            url,
            headers={
                "Accept": "text/html",
                # We don't want to blindly return cached data for
                # /simple/, because authors generally expect that
                # twine upload && pip install will function, but if
                # they've done a pip install in the last ~10 minutes
                # it won't. Thus by setting this to zero we will not
                # blindly use any cached data, however the benefit of
                # using max-age=0 instead of no-cache, is that we will
                # still support conditional requests, so we will still
                # minimize traffic sent in cases where the page hasn't
                # changed at all, we will just always incur the round
                # trip for the conditional GET now instead of only
                # once per 10 minutes.
                # For more information, please see pypa/pip#5670.
                "Cache-Control": "max-age=0",
            },
        )
        resp.raise_for_status()

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement of an url. Unless we issue a HEAD request on every
        # url we cannot know ahead of time for sure if something is HTML
        # or not. However we can check after we've downloaded it.
        content_type = resp.headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug(
                'Skipping page %s because of Content-Type: %s',
                link,
                content_type,
            )
            return

        inst = HTMLPage(resp.content, resp.url, resp.headers)
    except requests.HTTPError as exc:
        _handle_get_page_fail(link, exc, url)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, url, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, "connection error: %s" % exc, url)
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out", url)
    else:
        return inst
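`_get_content_type` (used above for the archive check) is also pip-internal. A hedged sketch of what such a helper might look like with a `requests.Session`, returning an empty string for non-HTTP(S) URLs:

from urllib import parse as urllib_parse
import requests

def _get_content_type(url, session):
    """HEAD the URL and return its Content-Type header, or '' for non-HTTP(S) URLs."""
    scheme = urllib_parse.urlsplit(url).scheme
    if scheme not in {'http', 'https'}:
        return ''
    resp = session.head(url, allow_redirects=True)
    resp.raise_for_status()
    return resp.headers.get("Content-Type", "")

# usage: _get_content_type('https://pypi.org/simple/pip/', session=requests.Session())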
Example #20

def url_to_path(url):
    """
    Convert a file: URL to a path.
    """
    assert url.startswith('file:'), (
        "You can only turn file: urls into filenames (not %r)" % url)

    _, netloc, path, _, _ = urllib_parse.urlsplit(url)

    # if we have a UNC path, prepend UNC share notation
    if netloc:
        netloc = '\\\\' + netloc

    path = urllib_request.url2pathname(netloc + path)
    return path


def path_to_url(path):
    """
    Convert a path to a file: URL.  The path will be made absolute and have
    quoted path parts.
    """
    path = os.path.normpath(os.path.abspath(path))
    url = urllib_parse.urljoin('file:', urllib_request.pathname2url(path))
    return url
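`path_to_url` and `url_to_path` are inverses for local paths; a quick round-trip check with the standard library behind the aliases:

import os
from urllib import parse as urllib_parse
from urllib import request as urllib_request

p = os.path.abspath('dist/pkg-1.0.tar.gz')
url = urllib_parse.urljoin('file:', urllib_request.pathname2url(p))
print(url)  # e.g. file:///home/user/dist/pkg-1.0.tar.gz
print(urllib_request.url2pathname(urllib_parse.urlsplit(url).path) == p)  # True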


def is_archive_file(name):
    """Return True if `name` is a considered as an archive file."""