def scraped_rel_links(self):
    for regex in (self._homepage_re, self._download_re):
        match = regex.search(self.content)
        if not match:
            continue
        href_match = self._href_re.search(self.content, pos=match.end())
        if not href_match:
            continue
        url = href_match.group(1) or href_match.group(2) or href_match.group(3)
        if not url:
            continue
        url = self.clean_link(urlparse.urljoin(self.base_url, url))
        yield Link(url, self)
def explicit_rel_links(self, rels=('homepage', 'download')):
    """Yields all links with the given relations"""
    rels = set(rels)

    for anchor in self.parsed.findall(".//a"):
        if anchor.get("rel") and anchor.get("href"):
            found_rels = set(anchor.get("rel").split())
            # Determine the intersection between what rels were found and
            # what rels were being looked for
            if found_rels & rels:
                href = anchor.get("href")
                url = self.clean_link(urlparse.urljoin(self.base_url, href))
                yield Link(url, self, trusted=False)
def scraped_rel_links(self):
    # Can we get rid of this horrible horrible method?
    for regex in (self._homepage_re, self._download_re):
        match = regex.search(self.content)
        if not match:
            continue
        href_match = self._href_re.search(self.content, pos=match.end())
        if not href_match:
            continue
        url = href_match.group(1) or href_match.group(2) or href_match.group(3)
        if not url:
            continue
        url = self.clean_link(urlparse.urljoin(self.base_url, url))
        yield Link(url, self, trusted=False, _deprecated_regex=True)
def explicit_rel_links(self, rels=('homepage', 'download')):
    """Yields all links with the given relations"""
    for match in self._rel_re.finditer(self.content):
        found_rels = match.group(1).lower().split()
        for rel in rels:
            if rel in found_rels:
                break
        else:
            continue
        match = self._href_re.search(match.group(0))
        if not match:
            continue
        url = match.group(1) or match.group(2) or match.group(3)
        url = self.clean_link(urlparse.urljoin(self.base_url, url))
        yield Link(url, self)
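# The regex-based scrapers above use class attributes that are not part of
# this listing.  The patterns below are only an illustrative sketch, assumed
# from the way the methods use them: _href_re must expose three alternative
# capture groups (double-quoted, single-quoted and unquoted href values),
# _rel_re must capture the value of a rel attribute, and the homepage /
# download patterns must locate the table headers that scraped_rel_links
# walks past.
import re

_href_re = re.compile(
    r'href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))',   # groups 1-3 as used above
    re.I | re.S,
)
_rel_re = re.compile(
    r'''<[^>]*\srel\s*=\s*['"]?([^'">]+)['"]?''',     # group 1: rel value(s)
    re.I,
)
_homepage_re = re.compile(r'<th>\s*home\s*page', re.I)
_download_re = re.compile(r'<th>\s*download\s+url', re.I)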
def links(self):
    """Yields all links in the page"""
    for anchor in self.parsed.findall(".//a"):
        if anchor.get("href"):
            href = anchor.get("href")
            url = self.clean_link(urlparse.urljoin(self.base_url, href))

            # Determine if this link is internal. If that distinction
            # doesn't make sense in this context, then we don't make
            # any distinction.
            internal = None
            if self.api_version and self.api_version >= 2:
                # Only api_versions >= 2 have a distinction between
                # external and internal links
                internal = bool(anchor.get("rel")
                                and "internal" in anchor.get("rel").split())

            yield Link(url, self, internal=internal)
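# For the internal/external distinction above to apply, the page has to
# advertise an api_version of 2 or higher.  A minimal sketch of the kind of
# simple-index HTML this expects; the exact markup is an assumption based on
# the attributes the method reads, not part of this listing:
_EXAMPLE_INDEX_HTML = """
<html>
  <head><meta name="api-version" value="2"/></head>
  <body>
    <a href="example-1.0.tar.gz" rel="internal">example-1.0.tar.gz</a>
    <a href="http://example.com/example-1.0.zip">example-1.0.zip</a>
  </body>
</html>
"""
# With api_version >= 2, the first anchor would yield Link(..., internal=True)
# and the second Link(..., internal=False); otherwise internal stays None.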
def parse_requirements(filename, finder=None, comes_from=None, options=None,
                       session=None):
    if session is None:
        session = PipSession()

    skip_match = None
    skip_regex = options.skip_requirements_regex if options else None
    if skip_regex:
        skip_match = re.compile(skip_regex)
    reqs_file_dir = os.path.dirname(os.path.abspath(filename))
    filename, content = get_file_content(
        filename,
        comes_from=comes_from,
        session=session,
    )
    for line_number, line in enumerate(content.splitlines()):
        line_number += 1
        line = line.strip()

        # Remove comments from file
        line = re.sub(r"(^|\s)#.*$", "", line)

        if not line or line.startswith('#'):
            continue
        if skip_match and skip_match.search(line):
            continue
        if line.startswith('-r') or line.startswith('--requirement'):
            if line.startswith('-r'):
                req_url = line[2:].strip()
            else:
                req_url = line[len('--requirement'):].strip().strip('=')
            if _scheme_re.search(filename):
                # Relative to a URL
                req_url = urlparse.urljoin(filename, req_url)
            elif not _scheme_re.search(req_url):
                req_url = os.path.join(os.path.dirname(filename), req_url)
            for item in parse_requirements(req_url, finder,
                                           comes_from=filename,
                                           options=options,
                                           session=session):
                yield item
        elif line.startswith('-Z') or line.startswith('--always-unzip'):
            # No longer used, but previously these were used in
            # requirement files, so we'll ignore.
            pass
        elif line.startswith('-f') or line.startswith('--find-links'):
            if line.startswith('-f'):
                line = line[2:].strip()
            else:
                line = line[len('--find-links'):].strip().lstrip('=')
            # FIXME: it would be nice to keep track of the source of
            # the find_links:
            # support a find-links local path relative to a requirements file
            relative_to_reqs_file = os.path.join(reqs_file_dir, line)
            if os.path.exists(relative_to_reqs_file):
                line = relative_to_reqs_file
            if finder:
                finder.find_links.append(line)
        elif line.startswith('-i') or line.startswith('--index-url'):
            if line.startswith('-i'):
                line = line[2:].strip()
            else:
                line = line[len('--index-url'):].strip().lstrip('=')
            if finder:
                finder.index_urls = [line]
        elif line.startswith('--extra-index-url'):
            line = line[len('--extra-index-url'):].strip().lstrip('=')
            if finder:
                finder.index_urls.append(line)
        elif line.startswith('--use-wheel'):
            finder.use_wheel = True
        elif line.startswith('--no-index'):
            finder.index_urls = []
        elif line.startswith("--allow-external"):
            line = line[len("--allow-external"):].strip().lstrip("=")
            finder.allow_external |= set([normalize_name(line).lower()])
        elif line.startswith("--allow-all-external"):
            finder.allow_all_external = True
        # Remove in 1.7
        elif line.startswith("--no-allow-external"):
            pass
        # Remove in 1.7
        elif line.startswith("--no-allow-insecure"):
            pass
        # Remove after 1.7
        elif line.startswith("--allow-insecure"):
            line = line[len("--allow-insecure"):].strip().lstrip("=")
            finder.allow_unverified |= set([normalize_name(line).lower()])
        elif line.startswith("--allow-unverified"):
            line = line[len("--allow-unverified"):].strip().lstrip("=")
            finder.allow_unverified |= set([normalize_name(line).lower()])
        else:
            comes_from = '-r %s (line %s)' % (filename, line_number)
            if line.startswith('-e') or line.startswith('--editable'):
                if line.startswith('-e'):
                    line = line[2:].strip()
                else:
                    line = line[len('--editable'):].strip().lstrip('=')
                req = InstallRequirement.from_editable(
                    line,
                    comes_from=comes_from,
                    default_vcs=options.default_vcs if options else None)
            else:
                req = InstallRequirement.from_line(
                    line,
                    comes_from,
                    prereleases=getattr(options, "pre", None))
            yield req
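# A hedged usage sketch for parse_requirements: every directive below is one
# the branches above recognise; the file name and the finder/options objects
# are placeholders, not part of this listing.
_EXAMPLE_REQUIREMENTS_TXT = """\
# comments and blank lines are stripped
-r common.txt
--index-url https://pypi.example.org/simple/
--extra-index-url https://mirror.example.org/simple/
-f ./local-wheels
--allow-external some-legacy-package
-e git+https://example.org/repo.git#egg=devpackage
requests>=2.0
"""
# for req in parse_requirements('requirements.txt', finder=finder,
#                                options=options, session=PipSession()):
#     ...  # each plain or editable line comes back as an InstallRequirement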
def links(self):
    """Yields all links in the page"""
    for match in self._href_re.finditer(self.content):
        url = match.group(1) or match.group(2) or match.group(3)
        url = self.clean_link(urlparse.urljoin(self.base_url, url))
        yield Link(url, self)
def get_page(cls, link, req, cache=None, skip_archives=True):
    url = link.url
    url = url.split('#', 1)[0]
    if cache.too_many_failures(url):
        return None

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug('Cannot look at %(scheme)s URL %(link)s' % locals())
            return None

    if cache is not None:
        inst = cache.get_page(url)
        if inst is not None:
            return inst
    try:
        if skip_archives:
            if cache is not None:
                if cache.is_archive(url):
                    return None
            filename = link.filename
            for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(url)
                    if content_type.lower().startswith('text/html'):
                        break
                    else:
                        logger.debug(
                            'Skipping page %s because of Content-Type: %s'
                            % (link, content_type))
                        if cache is not None:
                            cache.set_is_archive(url)
                        return None
        logger.debug('Getting page %s' % url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim final segment
            if not url.endswith('/'):
                url += '/'
            url = urlparse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s' % url)

        resp = urlopen(url)

        real_url = geturl(resp)
        headers = resp.info()
        contents = resp.read()
        encoding = headers.get('Content-Encoding', None)
        #XXX need to handle exceptions and add testing for this
        if encoding is not None:
            if encoding == 'gzip':
                contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
            if encoding == 'deflate':
                contents = zlib.decompress(contents)
        inst = cls(u(contents), real_url, headers)
    except (HTTPError, URLError, socket.timeout, socket.error, OSError,
            WindowsError):
        e = sys.exc_info()[1]
        desc = str(e)
        if isinstance(e, socket.timeout):
            log_meth = logger.info
            level = 1
            desc = 'timed out'
        elif isinstance(e, URLError):
            log_meth = logger.info
            if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
                desc = 'timed out'
                level = 1
            else:
                level = 2
        elif isinstance(e, HTTPError) and e.code == 404:
            ## FIXME: notify?
            log_meth = logger.info
            level = 2
        else:
            log_meth = logger.info
            level = 1
        log_meth('Could not fetch URL %s: %s' % (link, desc))
        log_meth(
            'Will skip URL %s when looking for download links for %s'
            % (link.url, req))
        if cache is not None:
            cache.add_page_failure(url, level)
        return None
    if cache is not None:
        cache.add_page([url, real_url], inst)
    return inst
def get_page(cls, link, req, cache=None, skip_archives=True, session=None):
    if session is None:
        session = PipSession()

    url = link.url
    url = url.split('#', 1)[0]
    if cache.too_many_failures(url):
        return None

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug('Cannot look at %(scheme)s URL %(link)s' % locals())
            return None

    if cache is not None:
        inst = cache.get_page(url)
        if inst is not None:
            return inst
    try:
        if skip_archives:
            if cache is not None:
                if cache.is_archive(url):
                    return None
            filename = link.filename
            for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(
                        url, session=session,
                    )
                    if content_type.lower().startswith('text/html'):
                        break
                    else:
                        logger.debug(
                            'Skipping page %s because of Content-Type: %s'
                            % (link, content_type))
                        if cache is not None:
                            cache.set_is_archive(url)
                        return None
        logger.debug('Getting page %s' % url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim final segment
            if not url.endswith('/'):
                url += '/'
            url = urlparse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s' % url)

        resp = session.get(url, headers={"Accept": "text/html"})
        resp.raise_for_status()

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement. For instance http://sourceforge.net/projects/docutils/files/docutils/0.8.1/docutils-0.8.1.tar.gz/download
        # redirects to http://superb-dca3.dl.sourceforge.net/project/docutils/docutils/0.8.1/docutils-0.8.1.tar.gz
        # Unless we issue a HEAD request on every url we cannot know
        # ahead of time for sure if something is HTML or not. However we
        # can check after we've downloaded it.
        content_type = resp.headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug('Skipping page %s because of Content-Type: %s'
                         % (link, content_type))
            if cache is not None:
                cache.set_is_archive(url)
            return None

        inst = cls(resp.text, resp.url, resp.headers, trusted=link.trusted)
    except requests.HTTPError as exc:
        level = 2 if exc.response.status_code == 404 else 1
        cls._handle_fail(req, link, exc, url, cache=cache, level=level)
    except requests.ConnectionError as exc:
        cls._handle_fail(
            req, link, "connection error: %s" % exc, url,
            cache=cache,
        )
    except requests.Timeout:
        cls._handle_fail(req, link, "timed out", url, cache=cache)
    except SSLError as exc:
        reason = ("There was a problem confirming the ssl certificate: "
                  "%s" % exc)
        cls._handle_fail(
            req, link, reason, url,
            cache=cache,
            level=2,
            meth=logger.notify,
        )
    except requests.TooManyRedirects as exc:
        cls._handle_fail(
            req, link, "Error: %s" % exc, url,
            cache=cache,
        )
    else:
        if cache is not None:
            cache.add_page([url, resp.url], inst)
        return inst
def get_page(cls, link, req, cache=None, skip_archives=True, session=None):
    if session is None:
        session = PipSession()

    url = link.url
    url = url.split('#', 1)[0]
    if cache.too_many_failures(url):
        return None

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug(
                'Cannot look at %(scheme)s URL %(link)s' % locals()
            )
            return None

    if cache is not None:
        inst = cache.get_page(url)
        if inst is not None:
            return inst
    try:
        if skip_archives:
            if cache is not None:
                if cache.is_archive(url):
                    return None
            filename = link.filename
            for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(
                        url, session=session,
                    )
                    if content_type.lower().startswith('text/html'):
                        break
                    else:
                        logger.debug(
                            'Skipping page %s because of Content-Type: '
                            '%s' % (link, content_type)
                        )
                        if cache is not None:
                            cache.set_is_archive(url)
                        return None
        logger.debug('Getting page %s' % url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = \
            urlparse.urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim
            # final segment
            if not url.endswith('/'):
                url += '/'
            url = urlparse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s' % url)

        resp = session.get(url, headers={"Accept": "text/html"})
        resp.raise_for_status()

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement of an url. Unless we issue a HEAD request on every
        # url we cannot know ahead of time for sure if something is HTML
        # or not. However we can check after we've downloaded it.
        content_type = resp.headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug(
                'Skipping page %s because of Content-Type: %s' %
                (link, content_type)
            )
            if cache is not None:
                cache.set_is_archive(url)
            return None

        inst = cls(resp.text, resp.url, resp.headers, trusted=link.trusted)
    except requests.HTTPError as exc:
        level = 2 if exc.response.status_code == 404 else 1
        cls._handle_fail(req, link, exc, url, cache=cache, level=level)
    except requests.ConnectionError as exc:
        cls._handle_fail(
            req, link, "connection error: %s" % exc, url,
            cache=cache,
        )
    except requests.Timeout:
        cls._handle_fail(req, link, "timed out", url, cache=cache)
    except SSLError as exc:
        reason = ("There was a problem confirming the ssl certificate: "
                  "%s" % exc)
        cls._handle_fail(
            req, link, reason, url,
            cache=cache,
            level=2,
            meth=logger.notify,
        )
    else:
        if cache is not None:
            cache.add_page([url, resp.url], inst)
        return inst
def get_page(cls, link, req, cache=None, skip_archives=True, session=None):
    if session is None:
        session = PipSession()

    url = link.url
    url = url.split("#", 1)[0]
    if cache.too_many_failures(url):
        return None

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in "+:":
            logger.debug("Cannot look at %(scheme)s URL %(link)s" % locals())
            return None

    if cache is not None:
        inst = cache.get_page(url)
        if inst is not None:
            return inst
    try:
        if skip_archives:
            if cache is not None:
                if cache.is_archive(url):
                    return None
            filename = link.filename
            for bad_ext in [".tar", ".tar.gz", ".tar.bz2", ".tgz", ".zip"]:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(url, session=session)
                    if content_type.lower().startswith("text/html"):
                        break
                    else:
                        logger.debug("Skipping page %s because of Content-Type: %s"
                                     % (link, content_type))
                        if cache is not None:
                            cache.set_is_archive(url)
                        return None
        logger.debug("Getting page %s" % url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
        if scheme == "file" and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim final segment
            if not url.endswith("/"):
                url += "/"
            url = urlparse.urljoin(url, "index.html")
            logger.debug(" file: URL is directory, getting %s" % url)

        resp = session.get(url)
        resp.raise_for_status()

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement. For instance http://sourceforge.net/projects/docutils/files/docutils/0.8.1/docutils-0.8.1.tar.gz/download
        # redirects to http://superb-dca3.dl.sourceforge.net/project/docutils/docutils/0.8.1/docutils-0.8.1.tar.gz
        # Unless we issue a HEAD request on every url we cannot know
        # ahead of time for sure if something is HTML or not. However we
        # can check after we've downloaded it.
        content_type = resp.headers.get("Content-Type", "unknown")
        if not content_type.lower().startswith("text/html"):
            logger.debug("Skipping page %s because of Content-Type: %s"
                         % (link, content_type))
            if cache is not None:
                cache.set_is_archive(url)
            return None

        inst = cls(resp.text, resp.url, resp.headers, trusted=link.trusted)
    except requests.HTTPError as exc:
        level = 2 if exc.response.status_code == 404 else 1
        cls._handle_fail(req, link, exc, url, cache=cache, level=level)
    except requests.Timeout:
        cls._handle_fail(req, link, "timed out", url, cache=cache)
    except SSLError as exc:
        reason = ("There was a problem confirming the ssl certificate: %s"
                  % exc)
        cls._handle_fail(req, link, reason, url, cache=cache, level=2,
                         meth=logger.notify)
    else:
        if cache is not None:
            cache.add_page([url, resp.url], inst)
        return inst
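# The requests-based variants above delegate failure handling to a
# cls._handle_fail helper that is not part of this listing.  A plausible
# sketch, inferred from its call sites and from the logging and cache
# bookkeeping done in the urllib-based variants (an assumption, not the
# verified implementation):
def _handle_fail(req, link, reason, url, cache=None, level=1, meth=None):
    if meth is None:
        meth = logger.info
    meth('Could not fetch URL %s: %s' % (link, reason))
    meth('Will skip URL %s when looking for download links for %s'
         % (link.url, req))
    if cache is not None:
        cache.add_page_failure(url, level)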
def get_page(cls, link, req, cache=None, skip_archives=True):
    url = link.url
    url = url.split('#', 1)[0]
    if cache.too_many_failures(url):
        return None

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug('Cannot look at %(scheme)s URL %(link)s' % locals())
            return None

    if cache is not None:
        inst = cache.get_page(url)
        if inst is not None:
            return inst
    try:
        if skip_archives:
            if cache is not None:
                if cache.is_archive(url):
                    return None
            filename = link.filename
            for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(url)
                    if content_type.lower().startswith('text/html'):
                        break
                    else:
                        logger.debug('Skipping page %s because of Content-Type: %s'
                                     % (link, content_type))
                        if cache is not None:
                            cache.set_is_archive(url)
                        return None
        logger.debug('Getting page %s' % url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim final segment
            if not url.endswith('/'):
                url += '/'
            url = urlparse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s' % url)

        resp = urlopen(url)

        real_url = geturl(resp)
        headers = resp.info()
        contents = resp.read()
        encoding = headers.get('Content-Encoding', None)
        #XXX need to handle exceptions and add testing for this
        if encoding is not None:
            if encoding == 'gzip':
                contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
            if encoding == 'deflate':
                contents = zlib.decompress(contents)

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement. For instance http://sourceforge.net/projects/docutils/files/docutils/0.8.1/docutils-0.8.1.tar.gz/download
        # redirects to http://superb-dca3.dl.sourceforge.net/project/docutils/docutils/0.8.1/docutils-0.8.1.tar.gz
        # Unless we issue a HEAD request on every url we cannot know
        # ahead of time for sure if something is HTML or not. However we
        # can check after we've downloaded it.
        content_type = headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug('Skipping page %s because of Content-Type: %s'
                         % (link, content_type))
            if cache is not None:
                cache.set_is_archive(url)
            return None

        inst = cls(u(contents), real_url, headers, trusted=link.trusted)
    except (HTTPError, URLError, socket.timeout, socket.error, OSError,
            WindowsError):
        e = sys.exc_info()[1]
        desc = str(e)
        if isinstance(e, socket.timeout):
            log_meth = logger.info
            level = 1
            desc = 'timed out'
        elif isinstance(e, URLError):
            # ssl/certificate error
            if hasattr(e, 'reason') and (isinstance(e.reason, ssl.SSLError)
                                         or isinstance(e.reason, CertificateError)):
                desc = 'There was a problem confirming the ssl certificate: %s' % e
                log_meth = logger.notify
            else:
                log_meth = logger.info
            if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
                desc = 'timed out'
                level = 1
            else:
                level = 2
        elif isinstance(e, HTTPError) and e.code == 404:
            ## FIXME: notify?
            log_meth = logger.info
            level = 2
        else:
            log_meth = logger.info
            level = 1
        log_meth('Could not fetch URL %s: %s' % (link, desc))
        log_meth('Will skip URL %s when looking for download links for %s'
                 % (link.url, req))
        if cache is not None:
            cache.add_page_failure(url, level)
        return None
    if cache is not None:
        cache.add_page([url, real_url], inst)
    return inst
def get_page(cls, link, req, cache=None, skip_archives=True):
    url = link.url
    url = url.split('#', 1)[0]
    if cache.too_many_failures(url):
        return None

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug('Cannot look at %(scheme)s URL %(link)s' % locals())
            return None

    if cache is not None:
        inst = cache.get_page(url)
        if inst is not None:
            return inst
    try:
        if skip_archives:
            if cache is not None:
                if cache.is_archive(url):
                    return None
            filename = link.filename
            for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(url)
                    if content_type.lower().startswith('text/html'):
                        break
                    else:
                        logger.debug(
                            'Skipping page %s because of Content-Type: %s'
                            % (link, content_type))
                        if cache is not None:
                            cache.set_is_archive(url)
                        return None
        logger.debug('Getting page %s' % url)

        # Tack list.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim final segment
            if not url.endswith('/'):
                url += '/'
            url = urlparse.urljoin(url, 'list.html')
            logger.debug(' file: URL is directory, getting %s' % url)

        resp = urlopen(url)

        real_url = geturl(resp)
        headers = resp.info()
        contents = resp.read()
        encoding = headers.get('Content-Encoding', None)
        #XXX need to handle exceptions and add testing for this
        if encoding is not None:
            if encoding == 'gzip':
                contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
            if encoding == 'deflate':
                contents = zlib.decompress(contents)

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement. For instance http://sourceforge.net/projects/docutils/files/docutils/0.8.1/docutils-0.8.1.tar.gz/download
        # redirects to http://superb-dca3.dl.sourceforge.net/project/docutils/docutils/0.8.1/docutils-0.8.1.tar.gz
        # Unless we issue a HEAD request on every url we cannot know
        # ahead of time for sure if something is HTML or not. However we
        # can check after we've downloaded it.
        content_type = headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug('Skipping page %s because of Content-Type: %s'
                         % (link, content_type))
            if cache is not None:
                cache.set_is_archive(url)
            return None

        inst = cls(u(contents), real_url, headers, trusted=link.trusted)
    except (HTTPError, URLError, socket.timeout, socket.error, OSError,
            WindowsError):
        e = sys.exc_info()[1]
        desc = str(e)
        if isinstance(e, socket.timeout):
            log_meth = logger.info
            level = 1
            desc = 'timed out'
        elif isinstance(e, URLError):
            # ssl/certificate error
            if hasattr(e, 'reason') and (
                    isinstance(e.reason, ssl.SSLError)
                    or isinstance(e.reason, CertificateError)):
                desc = 'There was a problem confirming the ssl certificate: %s' % e
                log_meth = logger.notify
            else:
                log_meth = logger.info
            if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
                desc = 'timed out'
                level = 1
            else:
                level = 2
        elif isinstance(e, HTTPError) and e.code == 404:
            ## FIXME: notify?
            log_meth = logger.info
            level = 2
        else:
            log_meth = logger.info
            level = 1
        log_meth('Could not fetch URL %s: %s' % (link, desc))
        log_meth(
            'Will skip URL %s when looking for download links for %s'
            % (link.url, req))
        if cache is not None:
            cache.add_page_failure(url, level)
        return None
    if cache is not None:
        cache.add_page([url, real_url], inst)
    return inst
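# A hedged end-to-end sketch of how these pieces fit together; the HTMLPage
# class name and the link/req/cache/finder objects are assumptions about the
# surrounding module, not shown in this listing.
# session = PipSession()
# page = HTMLPage.get_page(link, req, cache=cache, session=session)
# if page is not None:
#     for dl_link in page.links():              # every anchor on the page
#         ...
#     for rel_link in page.explicit_rel_links():  # rel="homepage"/"download"
#         ...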