def process_url(self, url, retrieve=False):
    """Evaluate a URL as a possible download, and maybe retrieve it"""
    if url in self.scanned_urls and not retrieve:
        return
    self.scanned_urls[url] = True
    if not URL_SCHEME(url):
        self.process_filename(url)
        return
    else:
        dists = list(distros_for_url(url))
        if dists:
            if not self.url_ok(url):
                return
            self.debug("Found link: %s", url)

    if dists or not retrieve or url in self.fetched_urls:
        map(self.add, dists)
        return  # don't need the actual page

    if not self.url_ok(url):
        self.fetched_urls[url] = True
        return

    self.info("Reading %s", url)
    f = self.open_url(
        url,
        "Download error on %s: %%s -- Some packages may not be found!" % url,
    )
    if f is None:
        return
    self.fetched_urls[url] = self.fetched_urls[f.url] = True

    if "html" not in f.headers.get("content-type", "").lower():
        f.close()  # not html, we can't process it
        return

    base = f.url  # handle redirects
    page = f.read()
    if not isinstance(page, str):
        # We are in Python 3 and got bytes. We want str.
        if isinstance(f, urllib2.HTTPError):
            # Errors have no charset, assume latin1:
            charset = "latin-1"
        else:
            charset = f.headers.get_param("charset") or "latin-1"
        page = page.decode(charset, "ignore")
    f.close()
    for match in HREF.finditer(page):
        link = urlparse.urljoin(base, htmldecode(match.group(1)))
        self.process_url(link)
    for index_url in self.index_urls:
        if url.startswith(index_url) and getattr(f, "code", None) != 404:
            page = self.process_index(url, page)
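For context, the early `dists = list(distros_for_url(url))` check is what lets a download URL short-circuit the page fetch. A minimal sketch of that setuptools helper follows, with a made-up URL; it yields one candidate distribution per plausible name/version split of the filename, and exact results vary by setuptools version:

from setuptools.package_index import distros_for_url

# A URL that already names a distribution produces candidates directly,
# so process_url() can add them without reading the page.
# "Foo" and the URL below are illustrative only.
for dist in distros_for_url("https://example.com/dist/Foo-1.0.tar.gz"):
    print(dist.project_name)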
def process_index(self, url, page):
    """Process the contents of a PyPI page

    Override: don't lowercase package name
    """
    if ALWAYS_REFRESH:
        # Zap ourselves from the fetched url list, otherwise we'll
        # never be updated as long as the server runs.
        del self.fetched_urls[url]

    def scan(link):
        # Process a URL to see if it's for a package page
        if link.startswith(self.index_url):
            parts = map(
                urllib2.unquote, link[len(self.index_url):].split('/')
            )
            if len(parts) == 2 and '#' not in parts[1]:
                # it's a package page, sanitize and index it
                pkg = safe_name(parts[0])
                ver = safe_version(parts[1])
                # changed "pkg.lower()" to "pkg"
                self.package_pages.setdefault(pkg, {})[link] = True
                return to_filename(pkg), to_filename(ver)
        return None, None

    # process an index page into the package-page index
    for match in HREF.finditer(page):
        scan(urlparse.urljoin(url, htmldecode(match.group(1))))

    pkg, ver = scan(url)  # ensure this page is in the page index
    if pkg:
        # process individual package page
        for new_url in find_external_links(url, page):
            # Process the found URL
            base, frag = egg_info_for_url(new_url)
            if base.endswith('.py') and not frag:
                if ver:
                    new_url += '#egg=%s-%s' % (pkg, ver)
                else:
                    self.need_version_info(url)
            self.scan_url(new_url)

        return PYPI_MD5.sub(
            lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page
        )
    else:
        return ""  # no sense double-scanning non-package pages
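The substantive change here is keying self.package_pages with `pkg` rather than `pkg.lower()`, so mixed-case project names survive indexing. A quick sketch of the pkg_resources helpers involved, with made-up inputs:

from pkg_resources import safe_name, safe_version, to_filename

pkg = safe_name("Some.Project")  # 'Some.Project' -- case is preserved, unlike pkg.lower()
ver = safe_version("1.0b1")      # '1.0b1'
# to_filename() swaps '-' for '_' so the pair can be matched against filenames.
print(to_filename(pkg), to_filename(ver))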
def process_index(self, url, page):
    """Process the contents of a PyPI page

    Override: don't lowercase package name
    """

    def scan(link):
        # Process a URL to see if it's for a package page
        if link.startswith(self.index_url):
            parts = map(urllib2.unquote, link[len(self.index_url) :].split("/"))
            if len(parts) == 2 and "#" not in parts[1]:
                # it's a package page, sanitize and index it
                pkg = safe_name(parts[0])
                ver = safe_version(parts[1])
                # changed "pkg.lower()" to "pkg"
                self.package_pages.setdefault(pkg, {})[link] = True
                return to_filename(pkg), to_filename(ver)
        return None, None

    # process an index page into the package-page index
    for match in HREF.finditer(page):
        try:
            scan(urlparse.urljoin(url, htmldecode(match.group(1))))
        except ValueError:
            pass

    pkg, ver = scan(url)  # ensure this page is in the page index
    if pkg:
        # process individual package page
        for new_url in find_external_links(url, page):
            # Process the found URL
            base, frag = egg_info_for_url(new_url)
            if base.endswith(".py") and not frag:
                if ver:
                    new_url += "#egg=%s-%s" % (pkg, ver)
                else:
                    self.need_version_info(url)
            self.scan_url(new_url)

        return PYPI_MD5.sub(
            lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page
        )
    else:
        return ""  # no sense double-scanning non-package pages
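This later variant drops the ALWAYS_REFRESH handling and wraps scan() in a try/except, which appears to guard against hrefs that urljoin (or the parsing that follows) cannot handle. A minimal reproduction of that failure mode, using a deliberately malformed href:

from urllib.parse import urljoin

try:
    urljoin("https://pypi.org/simple/", "http://[broken-ipv6-href/")
except ValueError as exc:
    print("skipping unparsable href:", exc)  # e.g. "Invalid IPv6 URL"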
def process_url(self, url, retrieve=False):
    """Evaluate a URL as a possible download, and maybe retrieve it"""
    if url in self.scanned_urls and not retrieve:
        return
    self.scanned_urls[url] = True
    if not URL_SCHEME(url):
        self.process_filename(url)
        return
    else:
        dists = list(distros_for_url(url))
        if dists:
            if not self.url_ok(url):
                return
            self.debug("Found link: %s", url)

    if dists or not retrieve or url in self.fetched_urls:
        list(map(self.add, dists))
        return  # don't need the actual page

    if not self.url_ok(url):
        self.fetched_urls[url] = True
        return

    self.info("Reading %s", url)
    self.fetched_urls[url] = True  # prevent multiple fetch attempts
    tmpl = "Download error on %s: %%s -- Some packages may not be found!"
    f = self.open_url(url, tmpl % url)
    if f is None:
        return
    if isinstance(f, urllib.error.HTTPError) and f.code == 401:
        self.info("Authentication error: %s" % f.msg)
    self.fetched_urls[f.url] = True
    if 'html' not in f.headers.get('content-type', '').lower():
        f.close()  # not html, we can't process it
        return

    base = f.url  # handle redirects
    page = f.read()

    # --- LOCAL CHANGES MADE HERE: ---
    if isinstance(page, six.text_type):
        page = page.encode('utf8')
        charset = 'utf8'
    else:
        if isinstance(f, urllib.error.HTTPError):
            # Errors have no charset, assume latin1:
            charset = 'latin-1'
        else:
            try:
                charset = f.headers.get_param('charset') or 'latin-1'
            except AttributeError:
                # Python 2
                charset = f.headers.getparam('charset') or 'latin-1'
    try:
        html_page = HTMLPage(page, charset, base, cache_link_parsing=False)
    except TypeError:
        html_page = HTMLPage(page, charset, base)
    # https://github.com/buildout/buildout/issues/598
    # use_deprecated_html5lib is a required addition in pip 22.
    try:
        plinks = parse_links(html_page, use_deprecated_html5lib=False)
    except TypeError:
        plinks = parse_links(html_page)
    plinks = list(plinks)
    pip_links = [l.url for l in plinks]
    # --- END OF LOCAL CHANGES ---

    if not isinstance(page, str):
        # In Python 3 and got bytes but want str.
        page = page.decode(charset, "ignore")
    f.close()

    # --- LOCAL CHANGES MADE HERE: ---
    links = []
    for match in HREF.finditer(page):
        link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
        links.append(_clean_link(link))
    # TODO: remove assertion and double index page parsing before releasing.
    assert set(pip_links) == set(links)
    for link in plinks:
        if _check_link_requires_python(link, PY_VERSION_INFO):
            self.process_url(link.url)
    # --- END OF LOCAL CHANGES ---

    if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
        page = self.process_index(url, page)
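The pip-internal _check_link_requires_python(link, PY_VERSION_INFO) call filters out links whose data-requires-python metadata excludes the running interpreter. Conceptually it is just a specifier check; sketched below with the public packaging API rather than pip internals, using made-up version strings:

from packaging.specifiers import SpecifierSet

requires_python = ">=3.7,<3.12"  # what a data-requires-python attribute might carry
running = "3.9.1"                # roughly what PY_VERSION_INFO encodes

if SpecifierSet(requires_python).contains(running):
    print("compatible link, follow it")
else:
    print("skip link")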