def find_eggs_in_url(url):
    """Read a URL and find any links to egg files

    Parameters
    ----------
    url : string
        the url to search

    Returns
    -------
    dists : a list of pkg_resources.Distribution objects
    """
    # Module-level memoization: never fetch the same URL twice.
    if url in cache:
        return cache[url]
    page = urlopen(url)
    try:
        info = page.info()
        content = page.read()
    finally:
        # Always release the connection, even if read() raises.
        page.close()
    # BUG FIX: ``dists`` was previously only bound inside the HTML branch,
    # so a non-HTML response raised NameError at ``cache[url] = dists``.
    # Initialize it unconditionally so non-HTML URLs yield (and cache) [].
    dists = []
    if info['Content-Type'].split(';')[0].strip() == 'text/html':
        for match in HREF.finditer(content):
            ref = match.group(1)
            schema, location, path, query, frag = urlsplit(ref)
            if location == '':
                # Relative link -- resolve it against the page URL.
                ref = urljoin(url, ref)
            dists += distros_for_url(ref)
    cache[url] = dists
    return dists
def process_url(self, url, retrieve=False):
    """Evaluate a URL as a possible download, and maybe retrieve it"""
    # Already scanned and no retrieval forced: nothing to do.
    if url in self.scanned_urls and not retrieve:
        return
    self.scanned_urls[url] = True
    if not URL_SCHEME(url):
        # No URL scheme present: treat it as a local filename.
        self.process_filename(url)
        return
    else:
        dists = list(distros_for_url(url))
        if dists:
            if not self.url_ok(url):
                return
            self.debug("Found link: %s", url)

    # If the URL itself names distributions, or we are only scanning (not
    # retrieving), or the page was fetched before, register and stop here.
    if dists or not retrieve or url in self.fetched_urls:
        map(self.add, dists)
        return  # don't need the actual page

    if not self.url_ok(url):
        self.fetched_urls[url] = True
        return

    self.info("Reading %s", url)
    f = self.open_url(url, "Download error on %s: %%s -- Some packages may not be found!" % url)
    if f is None:
        return
    # Mark both the requested URL and the final (post-redirect) URL fetched.
    self.fetched_urls[url] = self.fetched_urls[f.url] = True

    if "html" not in f.headers.get("content-type", "").lower():
        f.close()  # not html, we can't process it
        return

    base = f.url  # handle redirects
    page = f.read()
    if not isinstance(page, str):
        # We are in Python 3 and got bytes. We want str.
        if isinstance(f, urllib2.HTTPError):
            # Errors have no charset, assume latin1:
            charset = "latin-1"
        else:
            charset = f.headers.get_param("charset") or "latin-1"
        page = page.decode(charset, "ignore")
    f.close()
    # Recursively scan every hyperlink found on the page.
    for match in HREF.finditer(page):
        link = urlparse.urljoin(base, htmldecode(match.group(1)))
        self.process_url(link)
    # Pages under a configured index URL get additional index processing
    # (unless the fetch 404'd).
    for index_url in self.index_urls:
        if url.startswith(index_url) and getattr(f, "code", None) != 404:
            page = self.process_index(url, page)
def compute_version(filename):
    """Determine the version encoded in *filename*.

    A wheel-style filename is matched first; otherwise the first
    distribution parsed from the filename supplies the version.
    Returns ``None`` (after logging an ignore event) when neither
    strategy yields anything.
    """
    wheel_match = WHEEL_RE.match(filename)
    if wheel_match is not None:
        return wheel_match.group("ver")
    try:
        first_distro = next(distros_for_url(filename))
    except StopIteration:
        # Nothing parseable -- record the skipped filename and give up.
        logger.info({
            "event": "download_statitics.compute_version.ignore",
            "filename": filename
        })
        return None
    return first_distro.version
def process_url(self, url, retrieve=False):
    """Evaluate a URL as a possible download, and maybe retrieve it"""
    if url in self.scanned_urls and not retrieve:
        return
    self.scanned_urls[url] = True
    if not URL_SCHEME(url):
        # No scheme: treat as a local filename rather than a URL.
        self.process_filename(url)
        return
    else:
        dists = list(distros_for_url(url))
        if dists:
            if not self.url_ok(url):
                return
            self.debug("Found link: %s", url)

    # Direct distribution link, scan-only mode, or already fetched:
    # register the distributions and skip downloading the page.
    if dists or not retrieve or url in self.fetched_urls:
        list(map(self.add, dists))
        return  # don't need the actual page

    if not self.url_ok(url):
        self.fetched_urls[url] = True
        return

    self.info("Reading %s", url)
    self.fetched_urls[url] = True  # prevent multiple fetch attempts
    tmpl = "Download error on %s: %%s -- Some packages may not be found!"
    f = self.open_url(url, tmpl % url)
    if f is None:
        return
    if isinstance(f, urllib.error.HTTPError) and f.code == 401:
        self.info("Authentication error: %s" % f.msg)
    self.fetched_urls[f.url] = True
    if 'html' not in f.headers.get('content-type', '').lower():
        f.close()  # not html, we can't process it
        return

    base = f.url  # handle redirects
    page = f.read()
    # --- LOCAL CHANGES MADE HERE: ---
    # Normalize the page to bytes plus a charset so it can be handed to
    # pip's HTMLPage parser below.
    if isinstance(page, six.text_type):
        page = page.encode('utf8')
        charset = 'utf8'
    else:
        if isinstance(f, urllib.error.HTTPError):
            # Errors have no charset, assume latin1:
            charset = 'latin-1'
        else:
            try:
                charset = f.headers.get_param('charset') or 'latin-1'
            except AttributeError:
                # Python 2
                charset = f.headers.getparam('charset') or 'latin-1'
    # try/except shims below accommodate multiple pip versions whose
    # HTMLPage/parse_links signatures differ.
    try:
        html_page = HTMLPage(page, charset, base, cache_link_parsing=False)
    except TypeError:
        html_page = HTMLPage(page, charset, base)
    # https://github.com/buildout/buildout/issues/598
    # use_deprecated_html5lib is a required addition in pip 22.
    try:
        plinks = parse_links(html_page, use_deprecated_html5lib=False)
    except TypeError:
        plinks = parse_links(html_page)
    plinks = list(plinks)
    pip_links = [l.url for l in plinks]
    # --- END OF LOCAL CHANGES ---
    if not isinstance(page, str):
        # In Python 3 and got bytes but want str.
        page = page.decode(charset, "ignore")
    f.close()
    # --- LOCAL CHANGES MADE HERE: ---
    # Second, regex-based pass over the same page; kept alongside the pip
    # parser so the two link sets can be cross-checked during rollout.
    links = []
    for match in HREF.finditer(page):
        link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
        links.append(_clean_link(link))
    # TODO: remove assertion and double index page parsing before releasing.
    assert set(pip_links) == set(links)
    # Only recurse into links whose Requires-Python matches our interpreter.
    for link in plinks:
        if _check_link_requires_python(link, PY_VERSION_INFO):
            self.process_url(link.url)
    # --- END OF LOCAL CHANGES ---
    if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
        page = self.process_index(url, page)
class Package(models.Model):
    """A PyPI-style package tracked by a local index mirror (Python 2 / Django)."""

    # Owning index; remote metadata is pulled through index.client (XML-RPC).
    index = models.ForeignKey(PackageIndex)
    # Package name doubles as the primary key.
    name = models.CharField(max_length=255, unique=True, primary_key=True)
    auto_hide = models.BooleanField(default=True, blank=False)
    # Timestamps of the two sync passes below (null until first run).
    updated_from_remote_at = models.DateTimeField(null=True, blank=True)
    parsed_external_links_at = models.DateTimeField(null=True, blank=True)

    class Meta:
        verbose_name = _(u"package")
        verbose_name_plural = _(u"packages")
        get_latest_by = "releases__latest"
        ordering = ['name', ]

    def __unicode__(self):
        return self.name

    @models.permalink
    def get_absolute_url(self):
        return ('packageindex-package', (), {'package': self.name})

    @property
    def latest(self):
        # Most recent release, or None when the package has no releases yet.
        try:
            return self.releases.latest()
        except Release.DoesNotExist:
            return None

    def get_release(self, version):
        """Return the release object for version, or None"""
        try:
            return self.releases.get(version=version)
        except Release.DoesNotExist:
            return None

    def update_release_metadata(self, update_distribution_metadata=True):
        """Sync all releases of this package from the remote index via XML-RPC.

        Creates or updates a Release row per remote release string, then
        optionally refreshes each release's distribution metadata.
        """
        now = datetime.datetime.now()
        try:
            name = self.name.encode('ascii')
        except UnicodeEncodeError:
            print "illegal package name!"
            return
        for release_string in self.index.client.package_releases(
                self.name, True):  # True -> show hidden
            data = self.index.client.release_data(self.name, release_string)
            kwargs = {
                'hidden': data.get('_pypi_hidden', False),
                'package_info': MultiValueDict(),
                'is_from_external': False,
            }
            # Copy every remote metadata field into the MultiValueDict.
            for key, value in data.items():
                kwargs['package_info'][key] = value
            release, created = Release.objects.get_or_create(
                package=self, version=release_string, defaults=kwargs)
            if not created:
                # Existing release: overwrite its fields with fresh data.
                for key, value in kwargs.items():
                    setattr(release, key, value)
                release.save()
            if update_distribution_metadata:
                # NOTE(review): "metatdata" typo is the actual method name on
                # Release -- do not "fix" here without renaming it there too.
                release.update_distribution_metatdata()
        self.updated_from_remote_at = now
        self.save()

    def update_external_release_metadata(self, update_distribution_metadata=True):
        """Scan externally-hosted download links for this package and record
        any releases/distributions found there.
        """
        try:
            name = self.name.encode('ascii')
        except UnicodeEncodeError:
            print "illegal package name!"
            return
        mpackage = mirror.Package(package_name=name,
                                  pypi_base_url=self.index.simple_url)
        try:
            files = mpackage.ls(filename_matches='*', external_links=True,
                                follow_external_index_pages=True)
        except (PackageError, ), e:
            # Best-effort: a scrape failure just yields an empty file list.
            print type(e), e
            files = []
        for (dist_url, file_name, md5sum) in files:
            if dist_url.startswith('../../'):
                # Ignore relative urls, as they are files hosted on pypi and
                # have already been fetched over the xml-rpc api
                continue
            i = 1
            for dist in distros_for_url(dist_url):
                # Only accept distributions matching this package with a
                # parseable version.
                if not dist.project_name == self.name or not dist.version:
                    continue
                release = Release.objects.get_or_create(
                    package=self, version=dist.version,
                    defaults={'is_from_external': True})[0]
                pyversion = dist.py_version or 'any'
                f, ext = os.path.splitext(file_name)
                # Map file extension to a distribution filetype.
                # NOTE(review): os.path.splitext('x.tar.gz') yields '.gz', so
                # the '.tar.gz' case below can never match -- confirm intent.
                if ext.startswith('.egg'):
                    filetype = 'bdist_egg'
                elif ext in ('.exe', ):
                    filetype = 'bdist_wininst'
                elif ext in ('.dmg', '.pgk'):
                    filetype = 'bdist_dmg'
                elif ext in ('.rpm', ):
                    filetype = 'bdist_rpm'
                elif ext in ('.tar.gz', '.zip', '.bz2'):
                    filetype = 'sdist'
                else:
                    continue
                defaults = {
                    'filename': file_name,
                    'url': dist_url,
                    'is_from_external': True
                }
                distribution = Distribution.objects.get_or_create(
                    release=release, pyversion=pyversion, filetype=filetype,
                    defaults=defaults)[0]
                if distribution.is_from_external and not distribution.file:
                    # we only overwrite the url if the package has not been
                    # mirrored yet and it is not a real pypi hosted package
                    distribution.filename = file_name
                    distribution.url = dist_url
                    distribution.save()
                print i, dist.project_name, dist.py_version, dist.version, distribution
                i += 1
        self.parsed_external_links_at = datetime.datetime.now()
        self.save()
def version_for_url(project, url):
    """Return the version of the first distribution at *url* whose project
    name equals *project* (compared case-insensitively after safe_name
    normalization).  Raises IndexError when nothing matches.
    """
    wanted = safe_name(project).lower()
    matching = []
    for dist in distros_for_url(url):
        if safe_name(dist.project_name).lower() == wanted:
            matching.append(dist)
    return matching[0].version
def installable(project, url):
    """Return True when *url* yields at least one distribution whose
    project name matches *project* (case-insensitive, safe_name-normalized).
    """
    normalized = safe_name(project).lower()
    # any() short-circuits on the first match instead of materializing the
    # full list just to test its truthiness.
    return any(
        safe_name(dist.project_name).lower() == normalized
        for dist in distros_for_url(url)
    )
def get_distro(url):
    """Return the first distribution parsed from *url*.

    Raises StopIteration when the URL yields no distributions.
    """
    distro_iter = iter(distros_for_url(url))
    return next(distro_iter)
def version_for_url(project, url):
    """Version string of the first distribution at *url* matching *project*.

    Names are compared case-insensitively after safe_name normalization;
    raises IndexError when no distribution at *url* matches.
    """
    normalized = safe_name(project).lower()

    def _name_matches(dist):
        return safe_name(dist.project_name).lower() == normalized

    matching = list(filter(_name_matches, distros_for_url(url)))
    return matching[0].version
def installable(project, url):
    """Return True when *url* yields at least one distribution whose
    project name matches *project* (case-insensitive, safe_name-normalized).
    """
    normalized = safe_name(project).lower()
    # any() with a generator short-circuits at the first match rather than
    # building the entire match list only to convert it to bool.
    return any(
        safe_name(dist.project_name).lower() == normalized
        for dist in distros_for_url(url)
    )