def explicit_rel_links(self, rels=('homepage', 'download')):
    """Yields all links with the given relations"""
    rels = set(rels)

    for anchor in self.parsed.findall(".//a"):
        if anchor.get("rel") and anchor.get("href"):
            found_rels = set(anchor.get("rel").split())
            # Determine the intersection between what rels were found and
            # what rels were being looked for
            if found_rels & rels:
                href = anchor.get("href")
                url = self.clean_link(
                    urlparse.urljoin(self.base_url, href)
                )
                yield Link(url, self, trusted=False)

def scraped_rel_links(self):
    # Can we get rid of this horrible horrible method?
    for regex in (self._homepage_re, self._download_re):
        match = regex.search(self.content)
        if not match:
            continue
        href_match = self._href_re.search(self.content, pos=match.end())
        if not href_match:
            continue
        url = (
            href_match.group(1) or
            href_match.group(2) or
            href_match.group(3)
        )
        if not url:
            continue
        url = self.clean_link(urlparse.urljoin(self.base_url, url))
        yield Link(url, self, trusted=False, _deprecated_regex=True)

def links(self):
    """Yields all links in the page"""
    for anchor in self.parsed.findall(".//a"):
        if anchor.get("href"):
            href = anchor.get("href")
            url = self.clean_link(urlparse.urljoin(self.base_url, href))

            # Determine if this link is internal. If that distinction
            # doesn't make sense in this context, then we don't make
            # any distinction.
            internal = None
            if self.api_version and self.api_version >= 2:
                # Only api_versions >= 2 have a distinction between
                # external and internal links
                internal = bool(
                    anchor.get("rel") and
                    "internal" in anchor.get("rel").split()
                )

            yield Link(url, self, internal=internal)

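# Illustrative input for links() above (not part of the original source).
# With api_version >= 2, an index page can mark an anchor as internal via its
# rel attribute, e.g.
#
#     <a href="pip-1.5.tar.gz" rel="internal">pip-1.5.tar.gz</a>
#     <a href="https://example.com/pip-1.5.tar.gz">pip-1.5.tar.gz</a>
#
# The first anchor yields Link(..., internal=True), the second internal=False;
# when api_version is missing or < 2, internal stays None and no distinction
# is made.
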
def parse_requirements(filename, finder=None, comes_from=None, options=None,
                       session=None):
    if session is None:
        raise TypeError(
            "parse_requirements() missing 1 required keyword argument: "
            "'session'"
        )

    skip_match = None
    skip_regex = options.skip_requirements_regex if options else None
    if skip_regex:
        skip_match = re.compile(skip_regex)
    reqs_file_dir = os.path.dirname(os.path.abspath(filename))
    filename, content = get_file_content(
        filename,
        comes_from=comes_from,
        session=session,
    )
    for line_number, line in enumerate(content.splitlines(), 1):
        line = line.strip()

        # Remove comments from file
        line = re.sub(r"(^|\s)#.*$", "", line)

        if not line or line.startswith('#'):
            continue
        if skip_match and skip_match.search(line):
            continue
        if line.startswith('-r') or line.startswith('--requirement'):
            if line.startswith('-r'):
                req_url = line[2:].strip()
            else:
                req_url = line[len('--requirement'):].strip().strip('=')
            if _scheme_re.search(filename):
                # Relative to a URL
                req_url = urlparse.urljoin(filename, req_url)
            elif not _scheme_re.search(req_url):
                req_url = os.path.join(os.path.dirname(filename), req_url)
            for item in parse_requirements(
                    req_url, finder,
                    comes_from=filename,
                    options=options,
                    session=session):
                yield item
        elif line.startswith('-Z') or line.startswith('--always-unzip'):
            # No longer used, but previously these were used in
            # requirement files, so we'll ignore.
            pass
        elif line.startswith('-f') or line.startswith('--find-links'):
            if line.startswith('-f'):
                line = line[2:].strip()
            else:
                line = line[len('--find-links'):].strip().lstrip('=')
            # FIXME: it would be nice to keep track of the source of
            # the find_links:
            # support a find-links local path relative to a requirements file
            relative_to_reqs_file = os.path.join(reqs_file_dir, line)
            if os.path.exists(relative_to_reqs_file):
                line = relative_to_reqs_file
            if finder:
                finder.find_links.append(line)
        elif line.startswith('-i') or line.startswith('--index-url'):
            if line.startswith('-i'):
                line = line[2:].strip()
            else:
                line = line[len('--index-url'):].strip().lstrip('=')
            if finder:
                finder.index_urls = [line]
        elif line.startswith('--extra-index-url'):
            line = line[len('--extra-index-url'):].strip().lstrip('=')
            if finder:
                finder.index_urls.append(line)
        elif line.startswith('--use-wheel'):
            finder.use_wheel = True
        elif line.startswith('--no-index'):
            finder.index_urls = []
        elif line.startswith("--allow-external"):
            line = line[len("--allow-external"):].strip().lstrip("=")
            finder.allow_external |= set([normalize_name(line).lower()])
        elif line.startswith("--allow-all-external"):
            finder.allow_all_external = True
        # Remove in 1.7
        elif line.startswith("--no-allow-external"):
            pass
        # Remove in 1.7
        elif line.startswith("--no-allow-insecure"):
            pass
        # Remove after 1.7
        elif line.startswith("--allow-insecure"):
            line = line[len("--allow-insecure"):].strip().lstrip("=")
            finder.allow_unverified |= set([normalize_name(line).lower()])
        elif line.startswith("--allow-unverified"):
            line = line[len("--allow-unverified"):].strip().lstrip("=")
            finder.allow_unverified |= set([normalize_name(line).lower()])
        else:
            comes_from = '-r %s (line %s)' % (filename, line_number)
            if line.startswith('-e') or line.startswith('--editable'):
                if line.startswith('-e'):
                    line = line[2:].strip()
                else:
                    line = line[len('--editable'):].strip().lstrip('=')
                req = InstallRequirement.from_editable(
                    line,
                    comes_from=comes_from,
                    default_vcs=options.default_vcs if options else None
                )
            else:
                req = InstallRequirement.from_line(
                    line,
                    comes_from,
                    prereleases=getattr(options, "pre", None)
                )
            yield req

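# A minimal usage sketch for parse_requirements (not part of the original
# source). Assumptions: a pip 1.5-era layout where PipSession lives in
# pip.download and parse_requirements in pip.req; the requirements file path
# is illustrative.
#
#     from pip.download import PipSession
#     from pip.req import parse_requirements
#
#     session = PipSession()
#     for req in parse_requirements('requirements.txt', session=session):
#         print(req)
#
# Passing finder=None works for plain requirement lines; option lines such as
# --find-links, --index-url, or --use-wheel expect a PackageFinder to record
# their settings on.
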
def get_page(cls, link, req, skip_archives=True, session=None):
    if session is None:
        raise TypeError(
            "get_page() missing 1 required keyword argument: 'session'"
        )

    url = link.url
    url = url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug(
                'Cannot look at %(scheme)s URL %(link)s' % locals()
            )
            return None

    try:
        if skip_archives:
            filename = link.filename
            for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(
                        url, session=session,
                    )
                    if content_type.lower().startswith('text/html'):
                        break
                    else:
                        logger.debug(
                            'Skipping page %s because of Content-Type: '
                            '%s' % (link, content_type)
                        )
                        return

        logger.debug('Getting page %s' % url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = \
            urlparse.urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim
            # final segment
            if not url.endswith('/'):
                url += '/'
            url = urlparse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s' % url)

        resp = session.get(
            url,
            headers={
                "Accept": "text/html",
                "Cache-Control": "max-age=600",
            },
        )
        resp.raise_for_status()

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement of an url. Unless we issue a HEAD request on every
        # url we cannot know ahead of time for sure if something is HTML
        # or not. However we can check after we've downloaded it.
        content_type = resp.headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug(
                'Skipping page %s because of Content-Type: %s' %
                (link, content_type)
            )
            return

        inst = cls(resp.text, resp.url, resp.headers, trusted=link.trusted)
    except requests.HTTPError as exc:
        level = 2 if exc.response.status_code == 404 else 1
        cls._handle_fail(req, link, exc, url, level=level)
    except requests.ConnectionError as exc:
        cls._handle_fail(
            req, link, "connection error: %s" % exc, url,
        )
    except requests.Timeout:
        cls._handle_fail(req, link, "timed out", url)
    except SSLError as exc:
        reason = ("There was a problem confirming the ssl certificate: "
                  "%s" % exc)
        cls._handle_fail(
            req, link, reason, url,
            level=2,
            meth=logger.notify,
        )
    else:
        return inst

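# A minimal usage sketch for get_page (not part of the original source).
# Assumptions: pip 1.5-era names, where get_page is a classmethod on
# pip.index.HTMLPage and Link/PipSession are importable as shown; the index
# URL is illustrative, and None stands in for the InstallRequirement that is
# normally passed for error reporting.
#
#     from pip.download import PipSession
#     from pip.index import HTMLPage, Link
#
#     session = PipSession()
#     link = Link('https://pypi.python.org/simple/pip/')
#     page = HTMLPage.get_page(link, None, session=session)
#     if page is not None:
#         for found in page.links():
#             print(found.url)
#
# get_page returns None for VCS URLs, skipped archives, and fetch failures,
# so callers are expected to handle a None result.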