def explicit_rel_links(self, rels=('homepage', 'download')):
    """Yields all links with the given relations"""
    rels = set(rels)

    for anchor in self.parsed.findall(".//a"):
        if anchor.get("rel") and anchor.get("href"):
            found_rels = set(anchor.get("rel").split())
            # Determine the intersection between what rels were found and
            # what rels were being looked for
            if found_rels & rels:
                href = anchor.get("href")
                url = self.clean_link(
                    urlparse.urljoin(self.base_url, href)
                )
                yield Link(url, self, trusted=False)

def scraped_rel_links(self):
    # Can we get rid of this horrible horrible method?
    for regex in (self._homepage_re, self._download_re):
        match = regex.search(self.content)
        if not match:
            continue
        href_match = self._href_re.search(self.content, pos=match.end())
        if not href_match:
            continue
        url = (
            href_match.group(1) or
            href_match.group(2) or
            href_match.group(3)
        )
        if not url:
            continue
        url = self.clean_link(urlparse.urljoin(self.base_url, url))
        yield Link(url, self, trusted=False, _deprecated_regex=True)

def links(self):
    """Yields all links in the page"""
    for anchor in self.parsed.findall(".//a"):
        if anchor.get("href"):
            href = anchor.get("href")
            url = self.clean_link(urlparse.urljoin(self.base_url, href))

            # Determine if this link is internal. If that distinction
            # doesn't make sense in this context, then we don't make
            # any distinction.
            internal = None
            if self.api_version and self.api_version >= 2:
                # Only api_versions >= 2 have a distinction between
                # external and internal links
                internal = bool(
                    anchor.get("rel") and
                    "internal" in anchor.get("rel").split()
                )

            yield Link(url, self, internal=internal)

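# Illustrative input for links() above (not part of the original source).
# With api_version >= 2, an index page can mark an anchor as internal via its
# rel attribute, e.g.
#
#     <a href="pip-1.5.tar.gz" rel="internal">pip-1.5.tar.gz</a>
#     <a href="https://example.com/pip-1.5.tar.gz">pip-1.5.tar.gz</a>
#
# The first anchor yields Link(..., internal=True), the second internal=False;
# when api_version is missing or < 2, internal stays None and no distinction
# is made.
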
def parse_requirements(filename, finder=None, comes_from=None, options=None,
                       session=None):
    if session is None:
        raise TypeError(
            "parse_requirements() missing 1 required keyword argument: "
            "'session'"
        )

    skip_match = None
    skip_regex = options.skip_requirements_regex if options else None
    if skip_regex:
        skip_match = re.compile(skip_regex)
    reqs_file_dir = os.path.dirname(os.path.abspath(filename))
    filename, content = get_file_content(
        filename,
        comes_from=comes_from,
        session=session,
    )
    for line_number, line in enumerate(content.splitlines(), 1):
        line = line.strip()

        # Remove comments from file
        line = re.sub(r"(^|\s)#.*$", "", line)

        if not line or line.startswith('#'):
            continue
        if skip_match and skip_match.search(line):
            continue
        if line.startswith('-r') or line.startswith('--requirement'):
            if line.startswith('-r'):
                req_url = line[2:].strip()
            else:
                req_url = line[len('--requirement'):].strip().strip('=')
            if _scheme_re.search(filename):
                # Relative to a URL
                req_url = urlparse.urljoin(filename, req_url)
            elif not _scheme_re.search(req_url):
                req_url = os.path.join(os.path.dirname(filename), req_url)
            for item in parse_requirements(
                    req_url, finder,
                    comes_from=filename,
                    options=options,
                    session=session):
                yield item
        elif line.startswith('-Z') or line.startswith('--always-unzip'):
            # No longer used, but previously these were used in
            # requirement files, so we'll ignore.
            pass
        elif line.startswith('-f') or line.startswith('--find-links'):
            if line.startswith('-f'):
                line = line[2:].strip()
            else:
                line = line[len('--find-links'):].strip().lstrip('=')
            # FIXME: it would be nice to keep track of the source of
            # the find_links:
            # support a find-links local path relative to a requirements file
            relative_to_reqs_file = os.path.join(reqs_file_dir, line)
            if os.path.exists(relative_to_reqs_file):
                line = relative_to_reqs_file
            if finder:
                finder.find_links.append(line)
        elif line.startswith('-i') or line.startswith('--index-url'):
            if line.startswith('-i'):
                line = line[2:].strip()
            else:
                line = line[len('--index-url'):].strip().lstrip('=')
            if finder:
                finder.index_urls = [line]
        elif line.startswith('--extra-index-url'):
            line = line[len('--extra-index-url'):].strip().lstrip('=')
            if finder:
                finder.index_urls.append(line)
        elif line.startswith('--use-wheel'):
            finder.use_wheel = True
        elif line.startswith('--no-index'):
            finder.index_urls = []
        elif line.startswith("--allow-external"):
            line = line[len("--allow-external"):].strip().lstrip("=")
            finder.allow_external |= set([normalize_name(line).lower()])
        elif line.startswith("--allow-all-external"):
            finder.allow_all_external = True
        # Remove in 1.7
        elif line.startswith("--no-allow-external"):
            pass
        # Remove in 1.7
        elif line.startswith("--no-allow-insecure"):
            pass
        # Remove after 1.7
        elif line.startswith("--allow-insecure"):
            line = line[len("--allow-insecure"):].strip().lstrip("=")
            finder.allow_unverified |= set([normalize_name(line).lower()])
        elif line.startswith("--allow-unverified"):
            line = line[len("--allow-unverified"):].strip().lstrip("=")
            finder.allow_unverified |= set([normalize_name(line).lower()])
        else:
            comes_from = '-r %s (line %s)' % (filename, line_number)
            if line.startswith('-e') or line.startswith('--editable'):
                if line.startswith('-e'):
                    line = line[2:].strip()
                else:
                    line = line[len('--editable'):].strip().lstrip('=')
                req = InstallRequirement.from_editable(
                    line,
                    comes_from=comes_from,
                    default_vcs=options.default_vcs if options else None
                )
            else:
                req = InstallRequirement.from_line(
                    line,
                    comes_from,
                    prereleases=getattr(options, "pre", None)
                )
            yield req

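# A minimal usage sketch for parse_requirements (not part of the original
# source). Assumptions: a pip 1.5-era layout where PipSession lives in
# pip.download and parse_requirements in pip.req; the requirements file path
# is illustrative.
#
#     from pip.download import PipSession
#     from pip.req import parse_requirements
#
#     session = PipSession()
#     for req in parse_requirements('requirements.txt', session=session):
#         print(req)
#
# Passing finder=None works for plain requirement lines; option lines such as
# --find-links, --index-url, or --use-wheel expect a PackageFinder to record
# their settings on.
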
def get_page(cls, link, req, skip_archives=True, session=None):
    if session is None:
        raise TypeError(
            "get_page() missing 1 required keyword argument: 'session'"
        )

    url = link.url
    url = url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug(
                'Cannot look at %(scheme)s URL %(link)s' % locals()
            )
            return None

    try:
        if skip_archives:
            filename = link.filename
            for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(
                        url, session=session,
                    )
                    if content_type.lower().startswith('text/html'):
                        break
                    else:
                        logger.debug(
                            'Skipping page %s because of Content-Type: '
                            '%s' % (link, content_type)
                        )
                        return

        logger.debug('Getting page %s' % url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = \
            urlparse.urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim
            # final segment
            if not url.endswith('/'):
                url += '/'
            url = urlparse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s' % url)

        resp = session.get(
            url,
            headers={
                "Accept": "text/html",
                "Cache-Control": "max-age=600",
            },
        )
        resp.raise_for_status()

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement of an url. Unless we issue a HEAD request on every
        # url we cannot know ahead of time for sure if something is HTML
        # or not. However we can check after we've downloaded it.
        content_type = resp.headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug(
                'Skipping page %s because of Content-Type: %s' %
                (link, content_type)
            )
            return

        inst = cls(resp.text, resp.url, resp.headers, trusted=link.trusted)
    except requests.HTTPError as exc:
        level = 2 if exc.response.status_code == 404 else 1
        cls._handle_fail(req, link, exc, url, level=level)
    except requests.ConnectionError as exc:
        cls._handle_fail(
            req, link, "connection error: %s" % exc, url,
        )
    except requests.Timeout:
        cls._handle_fail(req, link, "timed out", url)
    except SSLError as exc:
        reason = ("There was a problem confirming the ssl certificate: "
                  "%s" % exc)
        cls._handle_fail(
            req, link, reason, url,
            level=2,
            meth=logger.notify,
        )
    else:
        return inst

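# A minimal usage sketch for get_page (not part of the original source).
# Assumptions: pip 1.5-era names, where get_page is a classmethod on
# pip.index.HTMLPage and Link/PipSession are importable as shown; the index
# URL is illustrative, and None stands in for the InstallRequirement that is
# normally passed for error reporting.
#
#     from pip.download import PipSession
#     from pip.index import HTMLPage, Link
#
#     session = PipSession()
#     link = Link('https://pypi.python.org/simple/pip/')
#     page = HTMLPage.get_page(link, None, session=session)
#     if page is not None:
#         for found in page.links():
#             print(found.url)
#
# get_page returns None for VCS URLs, skipped archives, and fetch failures,
# so callers are expected to handle a None result.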