def _set_all_fields(self, folder):
    # Cache records are stored one file per URL, keyed by the quoted URL.
    filename = os.path.join(folder, urllib.quote(self.url, ''))
    if not os.path.exists(filename):
        self._cache_url(filename)
    fp = open(filename, 'rb')
    try:
        # First line of the record: "<status code> <reason phrase>".
        line = fp.readline().strip()
        self.code, self.msg = line.split(None, 1)
    except ValueError:
        raise ValueError('Bad field line: %r' % line)
    self.code = int(self.code)
    self.msg = u(self.msg)
    # Header lines ("Key: value") follow, terminated by a blank line.
    for line in fp:
        if line == b('\n'):
            break
        key, value = line.split(b(': '), 1)
        self.headers[u(key)] = u(value.strip())
    # Everything after the blank line is the response body.
    for line in fp:
        self._body += line
    fp.close()
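# A minimal sketch (not from pip) of writing a cache record in the format
# that _set_all_fields parses above: a "<code> <msg>" status line, then
# "Key: value" header lines, a blank line, and the raw body. The
# write_cache_record helper and cache_dir argument are hypothetical;
# presumably _cache_url produces something equivalent.
import os
import urllib


def write_cache_record(cache_dir, url, code, msg, headers, body):
    # File name is the URL quoted with no safe characters, matching the
    # lookup in _set_all_fields.
    filename = os.path.join(cache_dir, urllib.quote(url, ''))
    fp = open(filename, 'wb')
    try:
        fp.write(('%s %s\n' % (code, msg)).encode('utf-8'))
        for key, value in headers.items():
            fp.write(('%s: %s\n' % (key, value)).encode('utf-8'))
        fp.write(b'\n')   # blank line separates headers from the body
        fp.write(body)
    finally:
        fp.close()

# Example (hypothetical paths/values):
# write_cache_record('/tmp/page-cache', 'http://example.com/simple/',
#                    200, 'OK', {'Content-Type': 'text/html'},
#                    b'<html></html>')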
def get_page(cls, link, req, cache=None, skip_archives=True):
    url = link.url
    url = url.split('#', 1)[0]
    if cache is not None and cache.too_many_failures(url):
        return None

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug('Cannot look at %(scheme)s URL %(link)s' % locals())
            return None

    if cache is not None:
        inst = cache.get_page(url)
        if inst is not None:
            return inst
    try:
        if skip_archives:
            if cache is not None:
                if cache.is_archive(url):
                    return None
            filename = link.filename
            for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(url)
                    if content_type.lower().startswith('text/html'):
                        break
                    else:
                        logger.debug(
                            'Skipping page %s because of Content-Type: %s'
                            % (link, content_type))
                        if cache is not None:
                            cache.set_is_archive(url)
                        return None
        logger.debug('Getting page %s' % url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = \
            urlparse.urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim
            # the final segment
            if not url.endswith('/'):
                url += '/'
            url = urlparse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s' % url)

        resp = urlopen(url)
        real_url = geturl(resp)
        headers = resp.info()
        contents = resp.read()
        encoding = headers.get('Content-Encoding', None)
        # XXX need to handle exceptions and add testing for this
        if encoding is not None:
            if encoding == 'gzip':
                contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
            if encoding == 'deflate':
                contents = zlib.decompress(contents)
        inst = cls(u(contents), real_url, headers)
    except (HTTPError, URLError, socket.timeout, socket.error, OSError,
            WindowsError):
        e = sys.exc_info()[1]
        desc = str(e)
        if isinstance(e, socket.timeout):
            log_meth = logger.info
            level = 1
            desc = 'timed out'
        elif isinstance(e, URLError):
            log_meth = logger.info
            if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
                desc = 'timed out'
                level = 1
            else:
                level = 2
        elif isinstance(e, HTTPError) and e.code == 404:
            ## FIXME: notify?
            log_meth = logger.info
            level = 2
        else:
            log_meth = logger.info
            level = 1
        log_meth('Could not fetch URL %s: %s' % (link, desc))
        log_meth('Will skip URL %s when looking for download links for %s'
                 % (link.url, req))
        if cache is not None:
            cache.add_page_failure(url, level)
        return None
    if cache is not None:
        cache.add_page([url, real_url], inst)
    return inst
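# Why the trailing slash matters before urljoin (see the file: URL handling
# above): without it, urljoin replaces the last path segment instead of
# appending to it. Standalone illustration, independent of pip:
try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

print(urljoin('file:///srv/packages', 'index.html'))
# -> file:///srv/index.html  (final segment dropped)
print(urljoin('file:///srv/packages/', 'index.html'))
# -> file:///srv/packages/index.html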
def get_page(cls, link, req, cache=None, skip_archives=True):
    url = link.url
    url = url.split('#', 1)[0]
    if cache is not None and cache.too_many_failures(url):
        return None

    # Check for VCS schemes that do not support lookup as web pages.
    from pip.vcs import VcsSupport
    for scheme in VcsSupport.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            logger.debug('Cannot look at %(scheme)s URL %(link)s' % locals())
            return None

    if cache is not None:
        inst = cache.get_page(url)
        if inst is not None:
            return inst
    try:
        if skip_archives:
            if cache is not None:
                if cache.is_archive(url):
                    return None
            filename = link.filename
            for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
                if filename.endswith(bad_ext):
                    content_type = cls._get_content_type(url)
                    if content_type.lower().startswith('text/html'):
                        break
                    else:
                        logger.debug(
                            'Skipping page %s because of Content-Type: %s'
                            % (link, content_type))
                        if cache is not None:
                            cache.set_is_archive(url)
                        return None
        logger.debug('Getting page %s' % url)

        # Tack index.html onto file:// URLs that point to directories
        (scheme, netloc, path, params, query, fragment) = \
            urlparse.urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            # add trailing slash if not present so urljoin doesn't trim
            # the final segment
            if not url.endswith('/'):
                url += '/'
            url = urlparse.urljoin(url, 'index.html')
            logger.debug(' file: URL is directory, getting %s' % url)

        resp = urlopen(url)
        real_url = geturl(resp)
        headers = resp.info()
        contents = resp.read()
        encoding = headers.get('Content-Encoding', None)
        # XXX need to handle exceptions and add testing for this
        if encoding is not None:
            if encoding == 'gzip':
                contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
            if encoding == 'deflate':
                contents = zlib.decompress(contents)

        # The check for archives above only works if the url ends with
        # something that looks like an archive. However that is not a
        # requirement. For instance
        # http://sourceforge.net/projects/docutils/files/docutils/0.8.1/docutils-0.8.1.tar.gz/download
        # redirects to
        # http://superb-dca3.dl.sourceforge.net/project/docutils/docutils/0.8.1/docutils-0.8.1.tar.gz
        # Unless we issue a HEAD request on every url we cannot know
        # ahead of time for sure if something is HTML or not. However we
        # can check after we've downloaded it.
        content_type = headers.get('Content-Type', 'unknown')
        if not content_type.lower().startswith("text/html"):
            logger.debug('Skipping page %s because of Content-Type: %s'
                         % (link, content_type))
            if cache is not None:
                cache.set_is_archive(url)
            return None

        inst = cls(u(contents), real_url, headers, trusted=link.trusted)
    except (HTTPError, URLError, socket.timeout, socket.error, OSError,
            WindowsError):
        e = sys.exc_info()[1]
        desc = str(e)
        if isinstance(e, socket.timeout):
            log_meth = logger.info
            level = 1
            desc = 'timed out'
        elif isinstance(e, URLError):
            # ssl/certificate error
            if hasattr(e, 'reason') and (isinstance(e.reason, ssl.SSLError)
                    or isinstance(e.reason, CertificateError)):
                desc = ('There was a problem confirming the ssl certificate: '
                        '%s' % e)
                log_meth = logger.notify
            else:
                log_meth = logger.info
            if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
                desc = 'timed out'
                level = 1
            else:
                level = 2
        elif isinstance(e, HTTPError) and e.code == 404:
            ## FIXME: notify?
            log_meth = logger.info
            level = 2
        else:
            log_meth = logger.info
            level = 1
        log_meth('Could not fetch URL %s: %s' % (link, desc))
        log_meth('Will skip URL %s when looking for download links for %s'
                 % (link.url, req))
        if cache is not None:
            cache.add_page_failure(url, level)
        return None
    if cache is not None:
        cache.add_page([url, real_url], inst)
    return inst
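# Round-trip sketch (not from pip) of the two Content-Encoding values
# handled above. As the XXX comment notes, real responses can be messier
# (e.g. raw deflate streams without a zlib header), so this only shows
# the happy path.
import gzip
import zlib
from io import BytesIO

original = b'<html><body>index</body></html>'

# gzip: compress into an in-memory buffer, then read it back the same way
# get_page does, via GzipFile(fileobj=BytesIO(...)).read().
buf = BytesIO()
gz = gzip.GzipFile(fileobj=buf, mode='wb')
gz.write(original)
gz.close()
assert gzip.GzipFile(fileobj=BytesIO(buf.getvalue())).read() == original

# deflate: a zlib-wrapped stream that zlib.decompress handles directly.
assert zlib.decompress(zlib.compress(original)) == original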
def call_subprocess(cmd, show_stdout=True, filter_stdout=None, cwd=None,
                    raise_on_returncode=True,
                    command_level=logger.DEBUG, command_desc=None,
                    extra_environ=None):
    if command_desc is None:
        cmd_parts = []
        for part in cmd:
            if ' ' in part or '\n' in part or '"' in part or "'" in part:
                part = '"%s"' % part.replace('"', '\\"')
            cmd_parts.append(part)
        command_desc = ' '.join(cmd_parts)
    if show_stdout:
        stdout = None
    else:
        stdout = subprocess.PIPE
    logger.log(command_level, "Running command %s" % command_desc)
    env = os.environ.copy()
    if extra_environ:
        env.update(extra_environ)
    try:
        proc = subprocess.Popen(
            cmd, stderr=subprocess.STDOUT, stdin=None, stdout=stdout,
            cwd=cwd, env=env)
    except Exception:
        e = sys.exc_info()[1]
        logger.fatal(
            "Error %s while executing command %s" % (e, command_desc))
        raise
    all_output = []
    if stdout is not None:
        stdout = proc.stdout
        while 1:
            line = u(stdout.readline())
            if not line:
                break
            line = line.rstrip()
            all_output.append(line + '\n')
            if filter_stdout:
                level = filter_stdout(line)
                if isinstance(level, tuple):
                    level, line = level
                logger.log(level, line)
                if not logger.stdout_level_matches(level):
                    logger.show_progress()
            else:
                logger.info(line)
    else:
        returned_stdout, returned_stderr = proc.communicate()
        all_output = [returned_stdout or '']
    proc.wait()
    if proc.returncode:
        if raise_on_returncode:
            if all_output:
                logger.notify('Complete output from command %s:'
                              % command_desc)
                logger.notify('\n'.join(all_output) +
                              '\n----------------------------------------')
            raise InstallationError(
                "Command %s failed with error code %s"
                % (command_desc, proc.returncode))
        else:
            logger.warn(
                "Command %s had error code %s"
                % (command_desc, proc.returncode))
    if stdout is not None:
        return ''.join(all_output)
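# Simplified standalone sketch of the same pattern, without pip's logger,
# filter_stdout hook, or InstallationError: merge stderr into stdout,
# stream the output line by line, and fail loudly on a non-zero exit code.
# The run_and_capture name is hypothetical.
import subprocess


def run_and_capture(cmd, cwd=None, env=None):
    proc = subprocess.Popen(cmd, stderr=subprocess.STDOUT,
                            stdout=subprocess.PIPE, cwd=cwd, env=env)
    lines = []
    for raw in iter(proc.stdout.readline, b''):
        line = raw.decode('utf-8', 'replace').rstrip()
        lines.append(line)
        print(line)
    proc.wait()
    if proc.returncode:
        raise RuntimeError('Command %r failed with error code %s'
                           % (cmd, proc.returncode))
    return '\n'.join(lines)

# Example: run_and_capture(['python', '--version'])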