Esempio n. 1
0
def _create_link_from_element(
        anchor,  # type: HTMLElement
        page_url,  # type: str
        base_url,  # type: str
):
    # type: (...) -> Optional[Link]
    """
    Convert an anchor element in a simple repository page to a Link.
    """
    href = anchor.get("href")
    if not href:
        return None

    url = _clean_link(urllib_parse.urljoin(base_url, href))
    pyrequire = anchor.get('data-requires-python')
    pyrequire = unescape(pyrequire) if pyrequire else None

    yanked_reason = anchor.get('data-yanked')
    if yanked_reason:
        # This is a unicode string in Python 2 (and 3).
        yanked_reason = unescape(yanked_reason)

    link = Link(
        url,
        comes_from=page_url,
        requires_python=pyrequire,
        yanked_reason=yanked_reason,
    )

    return link
Esempio n. 2
0
    def _parse_and_recurse(self, filename, constraint):
        # type: (str, bool) -> Iterator[ParsedLine]
        for line in self._parse_file(filename, constraint):
            if (not line.args and not line.opts.editables
                    and (line.opts.requirements or line.opts.constraints)):
                # parse a nested requirements file
                if line.opts.requirements:
                    req_path = line.opts.requirements[0]
                    nested_constraint = False
                else:
                    req_path = line.opts.constraints[0]
                    nested_constraint = True

                # original file is over http
                if SCHEME_RE.search(filename):
                    # do a url join so relative paths work
                    req_path = urllib_parse.urljoin(filename, req_path)
                # original file and nested file are paths
                elif not SCHEME_RE.search(req_path):
                    # do a join so relative paths work
                    req_path = os.path.join(
                        os.path.dirname(filename),
                        req_path,
                    )

                for inner_line in self._parse_and_recurse(
                        req_path,
                        nested_constraint,
                ):
                    yield inner_line
            else:
                yield line
Esempio n. 3
0
def path_to_url(path):
    """
    Convert a path to a file: URL.  The path will be made absolute and have
    quoted path parts.
    """
    path = os.path.normpath(os.path.abspath(path))
    url = urllib_parse.urljoin('file:', urllib_request.pathname2url(path))
    return url
Esempio n. 4
0
def path_to_url(path):
    """
    Convert a path to a file: URL.  The path will be made absolute and have
    quoted path parts.
    """
    path = os.path.normpath(os.path.abspath(path))
    url = urllib_parse.urljoin('file:', urllib_request.pathname2url(path))
    return url
Esempio n. 5
0
 def links(self):
     """Yields all links in the page"""
     for anchor in self.parsed.findall(".//a"):
         if anchor.get("href"):
             href = anchor.get("href")
             url = self.clean_link(urllib_parse.urljoin(
                 self.base_url, href))
             pyrequire = anchor.get('data-requires-python')
             pyrequire = unescape(pyrequire) if pyrequire else None
             yield Link(url, self, requires_python=pyrequire)
Esempio n. 6
0
 def links(self):
     """Yields all links in the page"""
     for anchor in self.parsed.findall(".//a"):
         if anchor.get("href"):
             href = anchor.get("href")
             url = self.clean_link(
                 urllib_parse.urljoin(self.base_url, href)
             )
             pyrequire = anchor.get('data-requires-python')
             pyrequire = unescape(pyrequire) if pyrequire else None
             yield Link(url, self, requires_python=pyrequire)
Esempio n. 7
0
def _get_html_page(link, session=None):
    # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'")

    url = link.url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith('/'):
            url += '/'
        url = urllib_parse.urljoin(url, 'index.html')
        logger.debug(' file: URL is directory, getting %s', url)

    try:
        resp = _get_html_response(url, session=session)
    except _NotHTTP:
        logger.debug(
            'Skipping page %s because it looks like an archive, and cannot '
            'be checked by HEAD.',
            link,
        )
    except _NotHTML as exc:
        logger.debug(
            'Skipping page %s because the %s request got Content-Type: %s',
            link,
            exc.request_desc,
            exc.content_type,
        )
    except HTTPError as exc:
        _handle_get_page_fail(link, exc)
    except RetryError as exc:
        _handle_get_page_fail(link, exc)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, "connection error: %s" % exc)
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out")
    else:
        return _make_html_page(resp)
    return None
Esempio n. 8
0
 def iter_links(self):
     """Yields all links in the page"""
     document = html5lib.parse(
         self.content,
         transport_encoding=_get_encoding_from_headers(self.headers),
         namespaceHTMLElements=False,
     )
     base_url = _determine_base_url(document, self.url)
     for anchor in document.findall(".//a"):
         if anchor.get("href"):
             href = anchor.get("href")
             url = _clean_link(urllib_parse.urljoin(base_url, href))
             pyrequire = anchor.get('data-requires-python')
             pyrequire = unescape(pyrequire) if pyrequire else None
             yield Link(url, self.url, requires_python=pyrequire)
Esempio n. 9
0
def process_line(line,
                 filename,
                 line_number,
                 finder=None,
                 comes_from=None,
                 options=None,
                 session=None,
                 wheel_cache=None,
                 constraint=False):
    """Process a single requirements line; This can result in creating/yielding
    requirements, or updating the finder.

    For lines that contain requirements, the only options that have an effect
    are from SUPPORTED_OPTIONS_REQ, and they are scoped to the
    requirement. Other options from SUPPORTED_OPTIONS may be present, but are
    ignored.

    For lines that do not contain requirements, the only options that have an
    effect are from SUPPORTED_OPTIONS. Options from SUPPORTED_OPTIONS_REQ may
    be present, but are ignored. These lines may contain multiple options
    (although our docs imply only one is supported), and all our parsed and
    affect the finder.

    :param constraint: If True, parsing a constraints file.
    :param options: OptionParser options that we may update
    """
    parser = build_parser(line)
    defaults = parser.get_default_values()
    defaults.index_url = None
    if finder:
        # `finder.format_control` will be updated during parsing
        defaults.format_control = finder.format_control
    args_str, options_str = break_args_options(line)
    if sys.version_info < (2, 7, 3):
        # Prior to 2.7.3, shlex cannot deal with unicode entries
        options_str = options_str.encode('utf8')
    opts, _ = parser.parse_args(shlex.split(options_str), defaults)

    # preserve for the nested code path
    line_comes_from = '%s %s (line %s)' % (
        '-c' if constraint else '-r',
        filename,
        line_number,
    )

    # yield a line requirement
    if args_str:
        isolated = options.isolated_mode if options else False
        if options:
            cmdoptions.check_install_build_global(options, opts)
        # get the options that apply to requirements
        req_options = {}
        for dest in SUPPORTED_OPTIONS_REQ_DEST:
            if dest in opts.__dict__ and opts.__dict__[dest]:
                req_options[dest] = opts.__dict__[dest]
        yield InstallRequirement.from_line(args_str,
                                           line_comes_from,
                                           constraint=constraint,
                                           isolated=isolated,
                                           options=req_options,
                                           wheel_cache=wheel_cache)

    # yield an editable requirement
    elif opts.editables:
        isolated = options.isolated_mode if options else False
        yield InstallRequirement.from_editable(opts.editables[0],
                                               comes_from=line_comes_from,
                                               constraint=constraint,
                                               isolated=isolated,
                                               wheel_cache=wheel_cache)

    # parse a nested requirements file
    elif opts.requirements or opts.constraints:
        if opts.requirements:
            req_path = opts.requirements[0]
            nested_constraint = False
        else:
            req_path = opts.constraints[0]
            nested_constraint = True
        # original file is over http
        if SCHEME_RE.search(filename):
            # do a url join so relative paths work
            req_path = urllib_parse.urljoin(filename, req_path)
        # original file and nested file are paths
        elif not SCHEME_RE.search(req_path):
            # do a join so relative paths work
            req_path = os.path.join(os.path.dirname(filename), req_path)
        # TODO: Why not use `comes_from='-r {} (line {})'` here as well?
        parser = parse_requirements(req_path,
                                    finder,
                                    comes_from,
                                    options,
                                    session,
                                    constraint=nested_constraint,
                                    wheel_cache=wheel_cache)
        for req in parser:
            yield req

    # percolate hash-checking option upward
    elif opts.require_hashes:
        options.require_hashes = opts.require_hashes

    # set finder options
    elif finder:
        if opts.index_url:
            finder.index_urls = [opts.index_url]
        if opts.no_index is True:
            finder.index_urls = []
        if opts.extra_index_urls:
            finder.index_urls.extend(opts.extra_index_urls)
        if opts.find_links:
            # FIXME: it would be nice to keep track of the source
            # of the find_links: support a find-links local path
            # relative to a requirements file.
            value = opts.find_links[0]
            req_dir = os.path.dirname(os.path.abspath(filename))
            relative_to_reqs_file = os.path.join(req_dir, value)
            if os.path.exists(relative_to_reqs_file):
                value = relative_to_reqs_file
            finder.find_links.append(value)
        if opts.pre:
            finder.allow_all_prereleases = True
        if opts.process_dependency_links:
            finder.process_dependency_links = True
        if opts.trusted_hosts:
            finder.secure_origins.extend(
                ("*", host, "*") for host in opts.trusted_hosts)
Esempio n. 10
0
 def _url_for_path(self, path):
     return urllib_parse.urljoin(self.url, path)
Esempio n. 11
0
    def get_page(cls, link, skip_archives=True, session=None):
        if session is None:
            raise TypeError(
                "get_page() missing 1 required keyword argument: 'session'"
            )

        url = link.url
        url = url.split('#', 1)[0]

        # Check for VCS schemes that do not support lookup as web pages.
        from pipenv.patched.notpip._internal.vcs import VcsSupport
        for scheme in VcsSupport.schemes:
            if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
                logger.debug('Cannot look at %s URL %s', scheme, link)
                return None

        try:
            if skip_archives:
                filename = link.filename
                for bad_ext in ARCHIVE_EXTENSIONS:
                    if filename.endswith(bad_ext):
                        content_type = cls._get_content_type(
                            url, session=session,
                        )
                        if content_type.lower().startswith('text/html'):
                            break
                        else:
                            logger.debug(
                                'Skipping page %s because of Content-Type: %s',
                                link,
                                content_type,
                            )
                            return

            logger.debug('Getting page %s', url)

            # Tack index.html onto file:// URLs that point to directories
            (scheme, netloc, path, params, query, fragment) = \
                urllib_parse.urlparse(url)
            if (scheme == 'file' and
                    os.path.isdir(urllib_request.url2pathname(path))):
                # add trailing slash if not present so urljoin doesn't trim
                # final segment
                if not url.endswith('/'):
                    url += '/'
                url = urllib_parse.urljoin(url, 'index.html')
                logger.debug(' file: URL is directory, getting %s', url)

            resp = session.get(
                url,
                headers={
                    "Accept": "text/html",
                    "Cache-Control": "max-age=600",
                },
            )
            resp.raise_for_status()

            # The check for archives above only works if the url ends with
            # something that looks like an archive. However that is not a
            # requirement of an url. Unless we issue a HEAD request on every
            # url we cannot know ahead of time for sure if something is HTML
            # or not. However we can check after we've downloaded it.
            content_type = resp.headers.get('Content-Type', 'unknown')
            if not content_type.lower().startswith("text/html"):
                logger.debug(
                    'Skipping page %s because of Content-Type: %s',
                    link,
                    content_type,
                )
                return

            inst = cls(resp.content, resp.url, resp.headers)
        except requests.HTTPError as exc:
            cls._handle_fail(link, exc, url)
        except SSLError as exc:
            reason = "There was a problem confirming the ssl certificate: "
            reason += str(exc)
            cls._handle_fail(link, reason, url, meth=logger.info)
        except requests.ConnectionError as exc:
            cls._handle_fail(link, "connection error: %s" % exc, url)
        except requests.Timeout:
            cls._handle_fail(link, "timed out", url)
        else:
            return inst
Esempio n. 12
0
    def get_page(cls, link, skip_archives=True, session=None):
        if session is None:
            raise TypeError(
                "get_page() missing 1 required keyword argument: 'session'")

        url = link.url
        url = url.split('#', 1)[0]

        # Check for VCS schemes that do not support lookup as web pages.
        from pipenv.patched.notpip._internal.vcs import VcsSupport
        for scheme in VcsSupport.schemes:
            if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
                logger.debug('Cannot look at %s URL %s', scheme, link)
                return None

        try:
            if skip_archives:
                filename = link.filename
                for bad_ext in ARCHIVE_EXTENSIONS:
                    if filename.endswith(bad_ext):
                        content_type = cls._get_content_type(
                            url,
                            session=session,
                        )
                        if content_type.lower().startswith('text/html'):
                            break
                        else:
                            logger.debug(
                                'Skipping page %s because of Content-Type: %s',
                                link,
                                content_type,
                            )
                            return

            logger.debug('Getting page %s', url)

            # Tack index.html onto file:// URLs that point to directories
            (scheme, netloc, path, params, query, fragment) = \
                urllib_parse.urlparse(url)
            if (scheme == 'file'
                    and os.path.isdir(urllib_request.url2pathname(path))):
                # add trailing slash if not present so urljoin doesn't trim
                # final segment
                if not url.endswith('/'):
                    url += '/'
                url = urllib_parse.urljoin(url, 'index.html')
                logger.debug(' file: URL is directory, getting %s', url)

            resp = session.get(
                url,
                headers={
                    "Accept": "text/html",
                    "Cache-Control": "max-age=600",
                },
            )
            resp.raise_for_status()

            # The check for archives above only works if the url ends with
            # something that looks like an archive. However that is not a
            # requirement of an url. Unless we issue a HEAD request on every
            # url we cannot know ahead of time for sure if something is HTML
            # or not. However we can check after we've downloaded it.
            content_type = resp.headers.get('Content-Type', 'unknown')
            if not content_type.lower().startswith("text/html"):
                logger.debug(
                    'Skipping page %s because of Content-Type: %s',
                    link,
                    content_type,
                )
                return

            inst = cls(resp.content, resp.url, resp.headers)
        except requests.HTTPError as exc:
            cls._handle_fail(link, exc, url)
        except SSLError as exc:
            reason = "There was a problem confirming the ssl certificate: "
            reason += str(exc)
            cls._handle_fail(link, reason, url, meth=logger.info)
        except requests.ConnectionError as exc:
            cls._handle_fail(link, "connection error: %s" % exc, url)
        except requests.Timeout:
            cls._handle_fail(link, "timed out", url)
        else:
            return inst
 def _url_for_path(self, path):
     # type: (str) -> str
     return urllib_parse.urljoin(self.url, path)
Esempio n. 14
0
 def url_to_path(self, path):
     return urllib_parse.urljoin(self.url, path)
Esempio n. 15
0
def process_line(line, filename, line_number, finder=None, comes_from=None,
                 options=None, session=None, wheel_cache=None,
                 constraint=False):
    """Process a single requirements line; This can result in creating/yielding
    requirements, or updating the finder.

    For lines that contain requirements, the only options that have an effect
    are from SUPPORTED_OPTIONS_REQ, and they are scoped to the
    requirement. Other options from SUPPORTED_OPTIONS may be present, but are
    ignored.

    For lines that do not contain requirements, the only options that have an
    effect are from SUPPORTED_OPTIONS. Options from SUPPORTED_OPTIONS_REQ may
    be present, but are ignored. These lines may contain multiple options
    (although our docs imply only one is supported), and all our parsed and
    affect the finder.

    :param constraint: If True, parsing a constraints file.
    :param options: OptionParser options that we may update
    """
    parser = build_parser(line)
    defaults = parser.get_default_values()
    defaults.index_url = None
    if finder:
        # `finder.format_control` will be updated during parsing
        defaults.format_control = finder.format_control
    args_str, options_str = break_args_options(line)
    if sys.version_info < (2, 7, 3):
        # Prior to 2.7.3, shlex cannot deal with unicode entries
        options_str = options_str.encode('utf8')
    opts, _ = parser.parse_args(shlex.split(options_str), defaults)

    # preserve for the nested code path
    line_comes_from = '%s %s (line %s)' % (
        '-c' if constraint else '-r', filename, line_number,
    )

    # yield a line requirement
    if args_str:
        isolated = options.isolated_mode if options else False
        if options:
            cmdoptions.check_install_build_global(options, opts)
        # get the options that apply to requirements
        req_options = {}
        for dest in SUPPORTED_OPTIONS_REQ_DEST:
            if dest in opts.__dict__ and opts.__dict__[dest]:
                req_options[dest] = opts.__dict__[dest]
        yield InstallRequirement.from_line(
            args_str, line_comes_from, constraint=constraint,
            isolated=isolated, options=req_options, wheel_cache=wheel_cache
        )

    # yield an editable requirement
    elif opts.editables:
        isolated = options.isolated_mode if options else False
        yield InstallRequirement.from_editable(
            opts.editables[0], comes_from=line_comes_from,
            constraint=constraint, isolated=isolated, wheel_cache=wheel_cache
        )

    # parse a nested requirements file
    elif opts.requirements or opts.constraints:
        if opts.requirements:
            req_path = opts.requirements[0]
            nested_constraint = False
        else:
            req_path = opts.constraints[0]
            nested_constraint = True
        # original file is over http
        if SCHEME_RE.search(filename):
            # do a url join so relative paths work
            req_path = urllib_parse.urljoin(filename, req_path)
        # original file and nested file are paths
        elif not SCHEME_RE.search(req_path):
            # do a join so relative paths work
            req_path = os.path.join(os.path.dirname(filename), req_path)
        # TODO: Why not use `comes_from='-r {} (line {})'` here as well?
        parser = parse_requirements(
            req_path, finder, comes_from, options, session,
            constraint=nested_constraint, wheel_cache=wheel_cache
        )
        for req in parser:
            yield req

    # percolate hash-checking option upward
    elif opts.require_hashes:
        options.require_hashes = opts.require_hashes

    # set finder options
    elif finder:
        if opts.index_url:
            finder.index_urls = [opts.index_url]
        if opts.no_index is True:
            finder.index_urls = []
        if opts.extra_index_urls:
            finder.index_urls.extend(opts.extra_index_urls)
        if opts.find_links:
            # FIXME: it would be nice to keep track of the source
            # of the find_links: support a find-links local path
            # relative to a requirements file.
            value = opts.find_links[0]
            req_dir = os.path.dirname(os.path.abspath(filename))
            relative_to_reqs_file = os.path.join(req_dir, value)
            if os.path.exists(relative_to_reqs_file):
                value = relative_to_reqs_file
            finder.find_links.append(value)
        if opts.pre:
            finder.allow_all_prereleases = True
        if opts.process_dependency_links:
            finder.process_dependency_links = True
        if opts.trusted_hosts:
            finder.secure_origins.extend(
                ("*", host, "*") for host in opts.trusted_hosts)
Esempio n. 16
0
def _get_html_page(link, session=None):
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'")

    url = link.url
    url = url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    from pipenv.patched.notpip._internal.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug('Cannot look at %s URL %s', scheme, link)
            return None

    try:
        filename = link.filename
        for bad_ext in ARCHIVE_EXTENSIONS:
            if filename.endswith(bad_ext):
                content_type = _get_content_type(url, session=session)
                if content_type.lower().startswith('text/html'):
                    break
                else:
                    logger.debug(
                        'Skipping page %s because of Content-Type: %s',
                        link,
                        content_type,
                    )
                    return

        logger.debug('Getting page %s', url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = \
            urllib_parse.urlparse(url)
        if (scheme == 'file'
                and os.path.isdir(urllib_request.url2pathname(path))):
            # add trailing slash if not present so urljoin doesn't trim
            # final segment
            if not url.endswith('/'):
                url += '/'
            url = urllib_parse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s', url)

        resp = session.get(
            url,
            headers={
                "Accept": "text/html",
                # We don't want to blindly returned cached data for
                # /simple/, because authors generally expecting that
                # twine upload && pip install will function, but if
                # they've done a pip install in the last ~10 minutes
                # it won't. Thus by setting this to zero we will not
                # blindly use any cached data, however the benefit of
                # using max-age=0 instead of no-cache, is that we will
                # still support conditional requests, so we will still
                # minimize traffic sent in cases where the page hasn't
                # changed at all, we will just always incur the round
                # trip for the conditional GET now instead of only
                # once per 10 minutes.
                # For more information, please see pypa/pip#5670.
                "Cache-Control": "max-age=0",
            },
        )
        resp.raise_for_status()

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement of an url. Unless we issue a HEAD request on every
        # url we cannot know ahead of time for sure if something is HTML
        # or not. However we can check after we've downloaded it.
        content_type = resp.headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug(
                'Skipping page %s because of Content-Type: %s',
                link,
                content_type,
            )
            return

        inst = HTMLPage(resp.content, resp.url, resp.headers)
    except requests.HTTPError as exc:
        _handle_get_page_fail(link, exc, url)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, url, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, "connection error: %s" % exc, url)
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out", url)
    else:
        return inst