Example #1
    def process_url(self, url, retrieve=False):
        """Evaluate a URL as a possible download, and maybe retrieve it"""
        if url in self.scanned_urls and not retrieve:
            return
        self.scanned_urls[url] = True
        if not URL_SCHEME(url):
            self.process_filename(url)
            return
        else:
            dists = list(distros_for_url(url))
            if dists:
                if not self.url_ok(url):
                    return
                self.debug("Found link: %s", url)

        if dists or not retrieve or url in self.fetched_urls:
            list(map(self.add, dists))  # force evaluation; a bare map() is lazy on Python 3
            return  # don't need the actual page

        if not self.url_ok(url):
            self.fetched_urls[url] = True
            return

        self.info("Reading %s", url)

        f = self.open_url(url, "Download error on %s: %%s -- Some packages may not be found!" % url)
        if f is None:
            return
        self.fetched_urls[url] = self.fetched_urls[f.url] = True

        if "html" not in f.headers.get("content-type", "").lower():
            f.close()  # not html, we can't process it
            return

        base = f.url  # handle redirects
        page = f.read()
        if not isinstance(page, str):  # We are in Python 3 and got bytes. We want str.
            if isinstance(f, urllib2.HTTPError):
                # Errors have no charset, assume latin1:
                charset = "latin-1"
            else:
                charset = f.headers.get_param("charset") or "latin-1"
            page = page.decode(charset, "ignore")
        f.close()
        for match in HREF.finditer(page):
            link = urlparse.urljoin(base, htmldecode(match.group(1)))
            self.process_url(link)

        for index_url in self.index_urls:
            if url.startswith(index_url) and getattr(f, "code", None) != 404:
                page = self.process_index(url, page)
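
Note: process_url above leans on a few module-level helpers from setuptools.package_index. The sketch below shows approximately how the two regex helpers behave; the exact patterns vary between setuptools releases, so treat these definitions as stand-ins rather than the shipped ones.

    import re

    # Approximate stand-ins for setuptools.package_index helpers (assumed, not
    # copied from a specific release):
    URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match   # "http:", "file:", ...
    HREF = re.compile(r"""href\s*=\s*['"]?([^'"> ]+)""", re.I)  # pull hrefs out of HTML

    print(bool(URL_SCHEME("https://pypi.org/simple/")))       # True  -> treated as a URL
    print(bool(URL_SCHEME("dists/foo-1.0.tar.gz")))           # False -> handed to process_filename()
    print(HREF.findall('<a href="foo-1.0.tar.gz">foo</a>'))   # ['foo-1.0.tar.gz']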
Example #2
    def process_index(self, url, page):
        """Process the contents of a PyPI page
        Override: don't lowercase package name
        """
        if ALWAYS_REFRESH:
            # Zap ourselves from the fetched url list, otherwise we'll
            # never be updated as long as the server runs.
            del self.fetched_urls[url]

        def scan(link):
            # Process a URL to see if it's for a package page
            if link.startswith(self.index_url):
                parts = map(
                    urllib2.unquote, link[len(self.index_url):].split('/')
                )
                if len(parts)==2 and '#' not in parts[1]:
                    # it's a package page, sanitize and index it
                    pkg = safe_name(parts[0])
                    ver = safe_version(parts[1])
                    # changed "pkg.lower()" to "pkg"
                    self.package_pages.setdefault(pkg, {})[link] = True
                    return to_filename(pkg), to_filename(ver)
            return None, None

        # process an index page into the package-page index
        for match in HREF.finditer(page):
            scan( urlparse.urljoin(url, htmldecode(match.group(1))) )

        pkg, ver = scan(url)   # ensure this page is in the page index
        if pkg:
            # process individual package page
            for new_url in find_external_links(url, page):
                # Process the found URL
                base, frag = egg_info_for_url(new_url)
                if base.endswith('.py') and not frag:
                    if ver:
                        new_url+='#egg=%s-%s' % (pkg,ver)
                    else:
                        self.need_version_info(url)
                self.scan_url(new_url)

            return PYPI_MD5.sub(
                lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1,3,2), page
            )
        else:
            return ""   # no sense double-scanning non-package pages
Example #3
    def process_index(self, url, page):
        """Process the contents of a PyPI page
        Override: don't lowercase package name
        """
        if ALWAYS_REFRESH:
            # Zap ourselves from the fetched url list, otherwise we'll
            # never be updated as long as the server runs.
            del self.fetched_urls[url]

        def scan(link):
            # Process a URL to see if it's for a package page
            if link.startswith(self.index_url):
                parts = map(urllib2.unquote,
                            link[len(self.index_url):].split('/'))
                if len(parts) == 2 and '#' not in parts[1]:
                    # it's a package page, sanitize and index it
                    pkg = safe_name(parts[0])
                    ver = safe_version(parts[1])
                    # changed "pkg.lower()" to "pkg"
                    self.package_pages.setdefault(pkg, {})[link] = True
                    return to_filename(pkg), to_filename(ver)
            return None, None

        # process an index page into the package-page index
        for match in HREF.finditer(page):
            scan(urlparse.urljoin(url, htmldecode(match.group(1))))

        pkg, ver = scan(url)  # ensure this page is in the page index
        if pkg:
            # process individual package page
            for new_url in find_external_links(url, page):
                # Process the found URL
                base, frag = egg_info_for_url(new_url)
                if base.endswith('.py') and not frag:
                    if ver:
                        new_url += '#egg=%s-%s' % (pkg, ver)
                    else:
                        self.need_version_info(url)
                self.scan_url(new_url)

            return PYPI_MD5.sub(
                lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2),
                page)
        else:
            return ""  # no sense double-scanning non-package pages
Example #4
    def process_index(self, url, page):
        """Process the contents of a PyPI page
        Override: don't lowercase package name
        """

        def scan(link):
            # Process a URL to see if it's for a package page
            if link.startswith(self.index_url):
                parts = map(urllib2.unquote, link[len(self.index_url) :].split("/"))
                if len(parts) == 2 and "#" not in parts[1]:
                    # it's a package page, sanitize and index it
                    pkg = safe_name(parts[0])
                    ver = safe_version(parts[1])
                    # changed "pkg.lower()" to "pkg"
                    self.package_pages.setdefault(pkg, {})[link] = True
                    return to_filename(pkg), to_filename(ver)
            return None, None

        # process an index page into the package-page index
        for match in HREF.finditer(page):
            try:
                scan(urlparse.urljoin(url, htmldecode(match.group(1))))
            except ValueError:
                pass

        pkg, ver = scan(url)  # ensure this page is in the page index
        if pkg:
            # process individual package page
            for new_url in find_external_links(url, page):
                # Process the found URL
                base, frag = egg_info_for_url(new_url)
                if base.endswith(".py") and not frag:
                    if ver:
                        new_url += "#egg=%s-%s" % (pkg, ver)
                    else:
                        self.need_version_info(url)
                self.scan_url(new_url)

            return PYPI_MD5.sub(lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page)
        else:
            return ""  # no sense double-scanning non-package pages
Example #5
        def process_url(self, url, retrieve=False):
            """Evaluate a URL as a possible download, and maybe retrieve it"""
            if url in self.scanned_urls and not retrieve:
                return
            self.scanned_urls[url] = True
            if not URL_SCHEME(url):
                self.process_filename(url)
                return
            else:
                dists = list(distros_for_url(url))
                if dists:
                    if not self.url_ok(url):
                        return
                    self.debug("Found link: %s", url)

            if dists or not retrieve or url in self.fetched_urls:
                list(map(self.add, dists))
                return  # don't need the actual page

            if not self.url_ok(url):
                self.fetched_urls[url] = True
                return

            self.info("Reading %s", url)
            self.fetched_urls[url] = True  # prevent multiple fetch attempts
            tmpl = "Download error on %s: %%s -- Some packages may not be found!"
            f = self.open_url(url, tmpl % url)
            if f is None:
                return
            if isinstance(f, urllib.error.HTTPError) and f.code == 401:
                self.info("Authentication error: %s" % f.msg)
            self.fetched_urls[f.url] = True
            if 'html' not in f.headers.get('content-type', '').lower():
                f.close()  # not html, we can't process it
                return

            base = f.url  # handle redirects
            page = f.read()

            # --- LOCAL CHANGES MADE HERE: ---

            if isinstance(page, six.text_type):
                page = page.encode('utf8')
                charset = 'utf8'
            else:
                if isinstance(f, urllib.error.HTTPError):
                    # Errors have no charset, assume latin1:
                    charset = 'latin-1'
                else:
                    try:
                        charset = f.headers.get_param('charset') or 'latin-1'
                    except AttributeError:
                        # Python 2
                        charset = f.headers.getparam('charset') or 'latin-1'
            try:
                html_page = HTMLPage(page, charset, base, cache_link_parsing=False)
            except TypeError:
                html_page = HTMLPage(page, charset, base)

            # https://github.com/buildout/buildout/issues/598
            # use_deprecated_html5lib is a required addition in pip 22.
            try:
                plinks = parse_links(html_page, use_deprecated_html5lib=False)
            except TypeError:
                plinks = parse_links(html_page)
            plinks = list(plinks)
            pip_links = [l.url for l in plinks]

            # --- END OF LOCAL CHANGES ---

            if not isinstance(page, str):
                # In Python 3 and got bytes but want str.
                page = page.decode(charset, "ignore")
            f.close()

            # --- LOCAL CHANGES MADE HERE: ---

            links = []
            for match in HREF.finditer(page):
                link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
                links.append(_clean_link(link))

            # TODO: remove assertion and double index page parsing before releasing.
            assert set(pip_links) == set(links)

            for link in plinks:
                if _check_link_requires_python(link, PY_VERSION_INFO):
                    self.process_url(link.url)

            # --- END OF LOCAL CHANGES ---

            if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
                page = self.process_index(url, page)
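
The _check_link_requires_python() call above drops links whose data-requires-python metadata excludes the running interpreter. A rough stand-alone equivalent using packaging directly (the function name and the PY_VERSION_INFO definition here are assumptions for illustration, not pip's or buildout's API):

    import sys
    from packaging.specifiers import InvalidSpecifier, SpecifierSet

    PY_VERSION_INFO = sys.version_info[:3]

    def link_supports_python(requires_python, py_version_info=PY_VERSION_INFO):
        # Hypothetical equivalent of the requires-python check: keep the link
        # unless its specifier explicitly excludes this interpreter.
        if not requires_python:
            return True
        try:
            spec = SpecifierSet(requires_python)
        except InvalidSpecifier:
            return True      # unparsable metadata: don't drop the link
        return '.'.join(map(str, py_version_info)) in spec

    print(link_supports_python('>=3.6'))            # True on any Python 3.6+
    print(link_supports_python('<3', (3, 9, 0)))    # False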