def _download_to(self, url, filename):
    self.info("Downloading %s", url)
    # Download the file
    fp = None
    try:
        checker = HashChecker.from_url(url)
        fp = self.open_url(strip_fragment(url))
        if isinstance(fp, HTTPError):
            raise DistutilsError(
                "Can't download %s: %s %s" % (url, fp.code, fp.msg)
            )
        headers = fp.info()
        blocknum = 0
        bs = self.dl_blocksize
        size = -1
        if "content-length" in headers:
            # Some servers return multiple Content-Length headers :(
            sizes = get_all_headers(headers, 'Content-Length')
            size = max(map(int, sizes))
            self.reporthook(url, filename, blocknum, bs, size)
        with open(filename, 'wb') as tfp:
            while True:
                block = fp.read(bs)
                if block:
                    checker.feed(block)
                    tfp.write(block)
                    blocknum += 1
                    self.reporthook(url, filename, blocknum, bs, size)
                else:
                    break
            self.check_hash(checker, filename, tfp)
        return headers
    finally:
        if fp:
            fp.close()
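# --- Illustrative sketch (not part of the original module) ---
# The download loop above streams fixed-size blocks, feeding each one to a
# hash checker before writing it to disk. A minimal standalone version of
# that pattern, with hashlib.md5 standing in for setuptools' HashChecker
# (this demo function and its names are hypothetical):
def _demo_streaming_hash_copy(src_path, dst_path, expected_md5=None, bs=8192):
    import hashlib
    digest = hashlib.md5()
    with open(src_path, 'rb') as fp, open(dst_path, 'wb') as tfp:
        while True:
            block = fp.read(bs)
            if not block:
                break
            digest.update(block)  # mirrors checker.feed(block)
            tfp.write(block)
    if expected_md5 is not None and digest.hexdigest() != expected_md5:
        raise ValueError("hash mismatch for %s" % dst_path)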
def creds_by_repository(self):
    sections_with_repositories = [
        section for section in self.sections()
        if self.get(section, 'repository').strip()
    ]
    return dict(map(self._get_repo_cred, sections_with_repositories))
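# --- Illustrative sketch (not part of the original module) ---
# creds_by_repository() runs against parsed .pypirc contents, keeping only
# sections whose `repository` key is non-blank. The same filtering shown
# with a plain RawConfigParser (this demo function is hypothetical):
def _demo_sections_with_repositories():
    from configparser import RawConfigParser
    cfg = RawConfigParser()
    cfg.read_string(
        "[pypi]\n"
        "repository = https://upload.pypi.org/legacy/\n"
        "[empty]\n"
        "repository =\n"
    )
    kept = [s for s in cfg.sections() if cfg.get(s, 'repository').strip()]
    assert kept == ['pypi']
    return kept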
def scan(link):
    # Process a URL to see if it's for a package page.
    # (Defined as a closure inside process_index, so `self` comes
    # from the enclosing scope.)
    if link.startswith(self.index_url):
        parts = list(map(
            unquote, link[len(self.index_url):].split('/')
        ))
        if len(parts) == 2 and '#' not in parts[1]:
            # it's a package page, sanitize and index it
            pkg = safe_name(parts[0])
            ver = safe_version(parts[1])
            self.package_pages.setdefault(pkg.lower(), {})[link] = True
            return to_filename(pkg), to_filename(ver)
    return None, None
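# --- Illustrative sketch (not part of the original module) ---
# scan() above treats "<index_url><project>/<version>" as a package page.
# The same split standalone, with urllib.parse.unquote standing in for the
# module-level unquote import (demo values are hypothetical):
def _demo_package_page_split(
        index_url='https://pypi.python.org/simple/',
        link='https://pypi.python.org/simple/Foo/1.0'):
    from urllib.parse import unquote
    parts = list(map(unquote, link[len(index_url):].split('/')))
    assert len(parts) == 2 and '#' not in parts[1]
    assert parts == ['Foo', '1.0']  # (project, version)
    return parts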
def process_url(self, url, retrieve=False):
    """Evaluate a URL as a possible download, and maybe retrieve it"""
    if os.getenv("CONDA_BUILD"):
        raise RuntimeError(
            "Setuptools downloading is disabled in conda build. "
            "Be sure to add all dependencies in the meta.yaml url=%s" % url
        )
    if url in self.scanned_urls and not retrieve:
        return
    self.scanned_urls[url] = True
    if not URL_SCHEME(url):
        self.process_filename(url)
        return
    else:
        dists = list(distros_for_url(url))
        if dists:
            if not self.url_ok(url):
                return
            self.debug("Found link: %s", url)

    if dists or not retrieve or url in self.fetched_urls:
        list(map(self.add, dists))
        return  # don't need the actual page

    if not self.url_ok(url):
        self.fetched_urls[url] = True
        return

    self.info("Reading %s", url)
    self.fetched_urls[url] = True  # prevent multiple fetch attempts
    f = self.open_url(
        url,
        "Download error on %s: %%s -- Some packages may not be found!" % url,
    )
    if f is None:
        return
    self.fetched_urls[f.url] = True
    if 'html' not in f.headers.get('content-type', '').lower():
        f.close()  # not html, we can't process it
        return

    base = f.url  # handle redirects
    page = f.read()
    if not isinstance(page, str):
        # We are in Python 3 and got bytes; we want str.
        if isinstance(f, HTTPError):
            # Errors have no charset, assume latin1:
            charset = 'latin-1'
        else:
            charset = f.headers.get_param('charset') or 'latin-1'
        page = page.decode(charset, "ignore")
    f.close()
    for match in HREF.finditer(page):
        link = urljoin(base, htmldecode(match.group(1)))
        self.process_url(link)
    if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
        page = self.process_index(url, page)
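# --- Illustrative sketch (not part of the original module) ---
# process_url() rescans every href on a fetched page, resolving each one
# against the final (post-redirect) URL. The same extract-and-resolve step
# standalone, with a deliberately simplified href regex (setuptools' real
# HREF pattern is more permissive; this demo function is hypothetical):
def _demo_extract_links(base='https://example.com/simple/foo/'):
    import re
    from urllib.parse import urljoin
    href = re.compile(r'href=["\']?([^"\'> ]+)', re.I)
    page = '<a href="../bar/">bar</a> <a href="baz-1.0.tar.gz">sdist</a>'
    links = [urljoin(base, m.group(1)) for m in href.finditer(page)]
    assert links == [
        'https://example.com/simple/bar/',
        'https://example.com/simple/foo/baz-1.0.tar.gz',
    ]
    return links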
def __init__(
        self, index_url="https://pypi.python.org/simple", hosts=('*',),
        ca_bundle=None, verify_ssl=True, *args, **kw
):
    Environment.__init__(self, *args, **kw)
    if not index_url.endswith('/'):
        index_url += '/'
    self.index_url = index_url
    self.scanned_urls = {}
    self.fetched_urls = {}
    self.package_pages = {}
    self.allows = re.compile('|'.join(map(translate, hosts))).match
    self.to_scan = []
    use_ssl = (
        verify_ssl
        and ssl_support.is_available
        and (ca_bundle or ssl_support.find_ca_bundle())
    )
    if use_ssl:
        self.opener = ssl_support.opener_for(ca_bundle)
    else:
        self.opener = urllib2.urlopen
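# --- Illustrative sketch (not part of the original module) ---
# The `allows` matcher built in __init__ compiles shell-style host patterns,
# assuming `translate` is fnmatch.translate as imported by setuptools:
def _demo_host_allows():
    import re
    from fnmatch import translate
    allows = re.compile(
        '|'.join(map(translate, ('*.python.org', 'localhost')))
    ).match
    assert allows('pypi.python.org')
    assert allows('localhost')
    assert not allows('example.com')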
def scan_egg_link(self, path, entry):
    with open(os.path.join(path, entry)) as raw_lines:
        # filter non-empty lines
        lines = list(filter(None, map(str.strip, raw_lines)))

    if len(lines) != 2:
        # format is not recognized; punt
        return

    egg_path, setup_path = lines

    for dist in find_distributions(os.path.join(path, egg_path)):
        dist.location = os.path.join(path, *lines)
        dist.precedence = SOURCE_DIST
        self.add(dist)
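# --- Illustrative sketch (not part of the original module) ---
# An .egg-link file holds exactly two non-empty lines: the egg's location
# and a setup-directory path (often '.'), e.g.:
#
#     /home/user/src/MyProject
#     .
#
# scan_egg_link() punts on anything else. The same parse standalone (this
# demo function and its default value are hypothetical):
def _demo_parse_egg_link(content='/home/user/src/MyProject\n.\n'):
    lines = list(filter(None, map(str.strip, content.splitlines())))
    if len(lines) != 2:
        return None  # format not recognized; punt, as above
    return lines  # (egg_path, setup_path)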
def process_filename(self, fn, nested=False):
    # process filenames or directories
    if not os.path.exists(fn):
        self.warn("Not found: %s", fn)
        return

    if os.path.isdir(fn) and not nested:
        path = os.path.realpath(fn)
        for item in os.listdir(path):
            self.process_filename(os.path.join(path, item), True)

    dists = distros_for_filename(fn)
    if dists:
        self.debug("Found: %s", fn)
        list(map(self.add, dists))
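# --- Illustrative sketch (not part of the original module) ---
# process_filename() descends exactly one directory level: entries found in
# a directory are re-processed with nested=True, which suppresses further
# recursion. The guard in isolation (this demo function is hypothetical):
def _demo_one_level_walk(root):
    import os
    seen = []

    def process(fn, nested=False):
        if os.path.isdir(fn) and not nested:
            for item in os.listdir(fn):
                process(os.path.join(fn, item), True)  # stop after one level
        seen.append(fn)

    process(root)
    return seen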
def find_external_links(url, page):
    """Find rel="homepage" and rel="download" links in `page`, yielding URLs"""
    for match in REL.finditer(page):
        tag, rel = match.groups()
        rels = set(map(str.strip, rel.lower().split(',')))
        if 'homepage' in rels or 'download' in rels:
            for match in HREF.finditer(tag):
                yield urljoin(url, htmldecode(match.group(1)))

    for tag in ("<th>Home Page", "<th>Download URL"):
        pos = page.find(tag)
        if pos != -1:
            match = HREF.search(page, pos)
            if match:
                yield urljoin(url, htmldecode(match.group(1)))
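# --- Illustrative sketch (not part of the original module) ---
# The rel-attribute check above accepts comma-separated, mixed-case values
# such as rel="Download, Homepage". The normalization step standalone (this
# demo function is hypothetical):
def _demo_rel_normalize(rel='Download, Homepage'):
    rels = set(map(str.strip, rel.lower().split(',')))
    assert rels == {'download', 'homepage'}
    return 'homepage' in rels or 'download' in rels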
def prescan(self):
    """Scan urls scheduled for prescanning (e.g. --find-links)"""
    if self.to_scan:
        list(map(self.scan_url, self.to_scan))
    self.to_scan = None  # from now on, go ahead and process immediately
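# --- Illustrative usage (not part of the original module) ---
# prescan() drains the queued --find-links URLs once, then sets to_scan to
# None so later scan_url() calls process immediately. Hypothetical usage,
# left commented out because it would hit the network:
#
#     index = PackageIndex()
#     index.add_find_links(['https://example.com/find-links.html'])
#     index.prescan()                 # fetch queued pages now
#     assert index.to_scan is None    # subsequent URLs process immediately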