Beispiel #1
0
def find_eggs_in_url(url):
    """Read a URL and find any links to egg files

    Results are memoised in the module-level ``cache`` keyed by ``url``,
    so each URL is fetched at most once.

    Parameters
    ----------
    url : string
        the url to search

    Returns
    -------
    dists : a list of pkg_resources.Distribution objects
        Empty when the response is not ``text/html`` (or carries no
        Content-Type header) or when the page contains no matching links.
    """
    if url in cache:
        return cache[url]
    page = urlopen(url)
    try:
        info = page.info()
        content = page.read()
    finally:
        page.close()

    # Always start from an empty result so that non-HTML responses are cached
    # and returned as "no distributions" instead of raising UnboundLocalError.
    dists = []
    # A response without a Content-Type header is treated as non-HTML.
    content_type = (info.get('Content-Type') or '').split(';')[0].strip()
    if content_type == 'text/html':
        for match in HREF.finditer(content):
            ref = match.group(1)
            schema, location, path, query, frag = urlsplit(ref)
            if location == '':
                # No network location: resolve the link against the page URL.
                ref = urljoin(url, ref)
            dists += distros_for_url(ref)
    cache[url] = dists
    return dists
Beispiel #2
0
    def process_url(self, url, retrieve=False):
        """Evaluate a URL as a possible download, and maybe retrieve it

        The URL is recorded in ``self.scanned_urls``. If it does not match
        ``URL_SCHEME`` it is handed to ``self.process_filename``. If
        ``distros_for_url`` recognises distributions in it they are passed to
        ``self.add``. Otherwise, when ``retrieve`` is true and the URL has not
        been fetched yet, the page is downloaded and every ``HREF`` match in
        it is processed recursively (without retrieval).

        :param url: the URL (or filename) to evaluate.
        :param retrieve: when true, download the page if the URL itself does
            not identify any distribution.
        :return: None.
        """
        if url in self.scanned_urls and not retrieve:
            return
        self.scanned_urls[url] = True
        if not URL_SCHEME(url):
            self.process_filename(url)
            return

        dists = list(distros_for_url(url))
        if dists:
            if not self.url_ok(url):
                return
            self.debug("Found link: %s", url)

        if dists or not retrieve or url in self.fetched_urls:
            # An explicit loop is required: on Python 3 ``map`` is lazy, so
            # ``map(self.add, dists)`` would never call ``self.add``.
            for dist in dists:
                self.add(dist)
            return  # don't need the actual page

        if not self.url_ok(url):
            self.fetched_urls[url] = True
            return

        self.info("Reading %s", url)

        f = self.open_url(url, "Download error on %s: %%s -- Some packages may not be found!" % url)
        if f is None:
            return
        self.fetched_urls[url] = self.fetched_urls[f.url] = True

        if "html" not in f.headers.get("content-type", "").lower():
            f.close()  # not html, we can't process it
            return

        base = f.url  # handle redirects
        try:
            page = f.read()
            if not isinstance(page, str):  # We are in Python 3 and got bytes. We want str.
                if isinstance(f, urllib2.HTTPError):
                    # Errors have no charset, assume latin1:
                    charset = "latin-1"
                else:
                    charset = f.headers.get_param("charset") or "latin-1"
                page = page.decode(charset, "ignore")
        finally:
            # Release the response even if reading or decoding fails.
            f.close()

        for match in HREF.finditer(page):
            link = urlparse.urljoin(base, htmldecode(match.group(1)))
            self.process_url(link)

        for index_url in self.index_urls:
            if url.startswith(index_url) and getattr(f, "code", None) != 404:
                page = self.process_index(url, page)
Beispiel #3
0
def compute_version(filename):
    """Return the version string encoded in ``filename``, or None.

    The ``ver`` group of ``WHEEL_RE`` is used when the pattern matches.
    Otherwise the version of the first distribution produced by
    ``distros_for_url`` is returned. When that yields nothing, the filename
    is logged at info level and None is returned.
    """
    wheel_match = WHEEL_RE.match(filename)
    if wheel_match:
        return wheel_match.group("ver")

    try:
        first_distro = next(distros_for_url(filename))
    except StopIteration:
        # Nothing recognisable in the filename: record it and give up.
        ignored_event = {
            "event": "download_statitics.compute_version.ignore",
            "filename": filename
        }
        logger.info(ignored_event)
        return None

    return first_distro.version
Beispiel #4
0
        def process_url(self, url, retrieve=False):
            """Evaluate a URL as a possible download, and maybe retrieve it

            The URL is recorded in ``self.scanned_urls``. A URL that does not
            match ``URL_SCHEME`` is handed to ``self.process_filename``.
            Distributions recognised by ``distros_for_url`` are passed to
            ``self.add``. Otherwise, when ``retrieve`` is true and the URL has
            not been fetched before, the page is downloaded, its links are
            extracted twice (with pip's ``parse_links`` and with the ``HREF``
            regex), and each link accepted by ``_check_link_requires_python``
            is processed recursively without retrieval.

            Every code path returns None.
            """
            if url in self.scanned_urls and not retrieve:
                return
            self.scanned_urls[url] = True
            if not URL_SCHEME(url):
                # No URL scheme matched: delegate to process_filename.
                self.process_filename(url)
                return
            else:
                dists = list(distros_for_url(url))
                if dists:
                    if not self.url_ok(url):
                        return
                    self.debug("Found link: %s", url)

            if dists or not retrieve or url in self.fetched_urls:
                # list() forces the map so self.add is actually called.
                list(map(self.add, dists))
                return  # don't need the actual page

            if not self.url_ok(url):
                self.fetched_urls[url] = True
                return

            self.info("Reading %s", url)
            self.fetched_urls[url] = True  # prevent multiple fetch attempts
            tmpl = "Download error on %s: %%s -- Some packages may not be found!"
            f = self.open_url(url, tmpl % url)
            if f is None:
                return
            if isinstance(f, urllib.error.HTTPError) and f.code == 401:
                self.info("Authentication error: %s" % f.msg)
            # Also mark the final URL (f.url) as fetched.
            self.fetched_urls[f.url] = True
            if 'html' not in f.headers.get('content-type', '').lower():
                f.close()  # not html, we can't process it
                return

            base = f.url  # handle redirects
            page = f.read()

            # --- LOCAL CHANGES MADE HERE: ---

            # HTMLPage is always given bytes plus a charset: text is encoded
            # as UTF-8, bytes are kept and the charset is taken from the
            # response headers (latin-1 when absent or for HTTP errors).
            if isinstance(page, six.text_type):
                page = page.encode('utf8')
                charset = 'utf8'
            else:
                if isinstance(f, urllib.error.HTTPError):
                    # Errors have no charset, assume latin1:
                    charset = 'latin-1'
                else:
                    try:
                        charset = f.headers.get_param('charset') or 'latin-1'
                    except AttributeError:
                        # Python 2
                        charset = f.headers.getparam('charset') or 'latin-1'
            # Retry without cache_link_parsing when the constructor rejects
            # it with TypeError -- presumably pip versions that do not accept
            # that keyword; TODO confirm.
            try:
                html_page = HTMLPage(page, charset, base, cache_link_parsing=False)
            except TypeError:
                html_page = HTMLPage(page, charset, base)

            # https://github.com/buildout/buildout/issues/598
            # use_deprecated_html5lib is a required addition in pip 22.
            try:
                plinks = parse_links(html_page, use_deprecated_html5lib=False)
            except TypeError:
                plinks = parse_links(html_page)
            # Materialise the links: they are iterated twice below.
            plinks = list(plinks)
            pip_links = [l.url for l in plinks]

            # --- END OF LOCAL CHANGES ---

            if not isinstance(page, str):
                # In Python 3 and got bytes but want str.
                page = page.decode(charset, "ignore")
            f.close()

            # --- LOCAL CHANGES MADE HERE: ---

            # Second, regex-based link extraction; used only for the
            # consistency check against the pip-parsed links below.
            links = []
            for match in HREF.finditer(page):
                link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
                links.append(_clean_link(link))

            # TODO: remove assertion and double index page parsing before releasing.
            # NOTE(review): this raises AssertionError whenever the two
            # parsers disagree, and is skipped entirely under ``python -O``.
            assert set(pip_links) == set(links)

            # Only follow links accepted by _check_link_requires_python for
            # PY_VERSION_INFO.
            for link in plinks:
                if _check_link_requires_python(link, PY_VERSION_INFO):
                    self.process_url(link.url)

            # --- END OF LOCAL CHANGES ---

            if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
                page = self.process_index(url, page)
Beispiel #5
0
class Package(models.Model):
    """A package belonging to a ``PackageIndex``, identified by its name.

    Besides the usual model accessors, the class can refresh its releases
    from the index client (``update_release_metadata``) and from files found
    through external links (``update_external_release_metadata``).
    """

    index = models.ForeignKey(PackageIndex)
    # The package name doubles as the primary key.
    name = models.CharField(max_length=255, unique=True, primary_key=True)
    auto_hide = models.BooleanField(default=True, blank=False)
    # Set at the end of update_release_metadata().
    updated_from_remote_at = models.DateTimeField(null=True, blank=True)
    # Set at the end of update_external_release_metadata().
    parsed_external_links_at = models.DateTimeField(null=True, blank=True)

    class Meta:
        verbose_name = _(u"package")
        verbose_name_plural = _(u"packages")
        get_latest_by = "releases__latest"
        ordering = [
            'name',
        ]

    def __unicode__(self):
        return self.name

    @models.permalink
    def get_absolute_url(self):
        return ('packageindex-package', (), {'package': self.name})

    @property
    def latest(self):
        """Return the latest release of this package, or None if it has none."""
        try:
            return self.releases.latest()
        except Release.DoesNotExist:
            return None

    def get_release(self, version):
        """Return the release object for version, or None"""
        try:
            return self.releases.get(version=version)
        except Release.DoesNotExist:
            return None

    @staticmethod
    def _filetype_for(file_name):
        """Return the distribution file type for ``file_name``, or None.

        The result is one of ``bdist_egg``, ``bdist_wininst``, ``bdist_dmg``,
        ``bdist_rpm`` or ``sdist``; None means the file is not recognised.
        """
        ext = os.path.splitext(file_name)[1]
        if ext.startswith('.egg'):
            return 'bdist_egg'
        if ext == '.exe':
            return 'bdist_wininst'
        if ext in ('.dmg', '.pkg'):
            return 'bdist_dmg'
        if ext == '.rpm':
            return 'bdist_rpm'
        # splitext() only yields the last suffix ('.gz' for 'x.tar.gz'), so
        # compound archive suffixes are matched against the whole file name.
        if ext in ('.zip', '.bz2') or file_name.endswith(('.tar.gz', '.tgz')):
            return 'sdist'
        return None

    def update_release_metadata(self, update_distribution_metadata=True):
        """Create or update a ``Release`` for every release the index reports.

        Hidden releases are included. When ``update_distribution_metadata`` is
        true, each release's distribution metadata is refreshed as well.
        Nothing is done (apart from printing a message) when the package name
        cannot be encoded as ASCII.
        """
        now = datetime.datetime.now()
        try:
            # Only used as a validity check; the encoded value is not needed.
            self.name.encode('ascii')
        except UnicodeEncodeError:
            print("illegal package name!")
            return
        client = self.index.client
        for release_string in client.package_releases(
                self.name, True):  # True -> show hidden
            data = client.release_data(self.name, release_string)
            package_info = MultiValueDict()
            for key, value in data.items():
                package_info[key] = value
            kwargs = {
                'hidden': data.get('_pypi_hidden', False),
                'package_info': package_info,
                'is_from_external': False,
            }
            release, created = Release.objects.get_or_create(
                package=self, version=release_string, defaults=kwargs)
            if not created:
                # Existing release: overwrite it with the fresh remote data.
                for key, value in kwargs.items():
                    setattr(release, key, value)
                release.save()
            if update_distribution_metadata:
                release.update_distribution_metatdata()
        self.updated_from_remote_at = now
        self.save()

    def update_external_release_metadata(self,
                                         update_distribution_metadata=True):
        """Register releases and distributions found through external links.

        Files listed by ``mirror.Package.ls`` are inspected with
        ``distros_for_url``; for each distribution matching this package a
        ``Release`` and a ``Distribution`` are created if missing.
        ``update_distribution_metadata`` is accepted for signature parity
        with ``update_release_metadata`` and is not used here.
        """
        try:
            name = self.name.encode('ascii')
        except UnicodeEncodeError:
            print("illegal package name!")
            return
        mpackage = mirror.Package(package_name=name,
                                  pypi_base_url=self.index.simple_url)
        try:
            files = mpackage.ls(filename_matches='*',
                                external_links=True,
                                follow_external_index_pages=True)
        except PackageError as e:
            print("%s %s" % (type(e), e))
            files = []
        for (dist_url, file_name, md5sum) in files:
            if dist_url.startswith('../../'):
                # Ignore relative urls, as they are files hosted on pypi and
                # have already been fetched over the xml-rpc api
                continue
            i = 1
            for dist in distros_for_url(dist_url):
                if not dist.project_name == self.name or not dist.version:
                    continue
                # The release is created before the file type is checked, so
                # a release may exist even when the file is skipped below.
                release = Release.objects.get_or_create(
                    package=self,
                    version=dist.version,
                    defaults={'is_from_external': True})[0]
                pyversion = dist.py_version or 'any'
                filetype = self._filetype_for(file_name)
                if filetype is None:
                    continue
                defaults = {
                    'filename': file_name,
                    'url': dist_url,
                    'is_from_external': True
                }
                distribution = Distribution.objects.get_or_create(
                    release=release,
                    pyversion=pyversion,
                    filetype=filetype,
                    defaults=defaults)[0]
                if distribution.is_from_external and not distribution.file:
                    # we only overwrite the url if the package has not been
                    # mirrored yet and it is not a real pypi hosted package
                    distribution.filename = file_name
                    distribution.url = dist_url
                    distribution.save()

                print("%s %s %s %s %s" % (i, dist.project_name,
                                          dist.py_version, dist.version,
                                          distribution))
                i += 1
        self.parsed_external_links_at = datetime.datetime.now()
        self.save()
def version_for_url(project, url):
    """Return the version of the first distribution at ``url`` for ``project``.

    Project names are compared after ``safe_name`` normalisation and
    lower-casing. Raises IndexError when no distribution matches.
    """
    wanted = safe_name(project).lower()
    matching = []
    for candidate in distros_for_url(url):
        if safe_name(candidate.project_name).lower() == wanted:
            matching.append(candidate)
    return matching[0].version
def installable(project, url):
    """Return True if ``url`` yields a distribution belonging to ``project``.

    Project names are compared after ``safe_name`` normalisation and
    lower-casing.
    """
    normalized = safe_name(project).lower()
    # any() stops at the first match instead of building a throwaway list.
    return any(
        safe_name(dist.project_name).lower() == normalized
        for dist in distros_for_url(url)
    )
Beispiel #8
0
 def get_distro(url):
     """Return the first item produced by ``distros_for_url(url)``.

     Raises StopIteration when nothing is produced.
     """
     candidates = distros_for_url(url)
     return next(candidates)
Beispiel #9
0
def version_for_url(project, url):
    """Return the version of the first distribution at ``url`` for ``project``.

    Project names are compared after ``safe_name`` normalisation and
    lower-casing. Raises IndexError when no distribution matches.
    """
    target = safe_name(project).lower()

    def _same_project(dist):
        return safe_name(dist.project_name).lower() == target

    candidates = list(filter(_same_project, distros_for_url(url)))
    return candidates[0].version
Beispiel #10
0
def installable(project, url):
    """Return True if ``url`` yields a distribution belonging to ``project``.

    Project names are compared after ``safe_name`` normalisation and
    lower-casing.
    """
    normalized = safe_name(project).lower()
    # any() stops at the first match instead of building a throwaway list.
    return any(
        safe_name(dist.project_name).lower() == normalized
        for dist in distros_for_url(url)
    )
Beispiel #11
0
 def get_distro(url):
     """Return the first item produced by ``distros_for_url(url)``.

     Raises StopIteration when nothing is produced.
     """
     found = distros_for_url(url)
     return next(found)