REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
# this line is here to fix emacs' cruddy broken syntax highlighting


# @unique_values (defined earlier in the module) de-duplicates the URLs
# this generator yields.
@unique_values
def find_external_links(url, page):
    """Find rel="homepage" and rel="download" links in `page`, yielding URLs"""
    for match in REL.finditer(page):
        tag, rel = match.groups()
        rels = set(map(str.strip, rel.lower().split(',')))
        if 'homepage' in rels or 'download' in rels:
            for match in HREF.finditer(tag):
                yield urljoin(url, htmldecode(match.group(1)))

    for tag in ("<th>Home Page", "<th>Download URL"):
        pos = page.find(tag)
        if pos != -1:
            match = HREF.search(page, pos)
            if match:
                yield urljoin(url, htmldecode(match.group(1)))
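# A quick, hypothetical demonstration of find_external_links() (not part of
# the original module); the HTML below is a made-up fragment in the style of
# a legacy PyPI project page, exercising both the rel= scan and the
# "<th>Download URL" fallback.
def _demo_find_external_links():
    page = (
        '<a rel="homepage" href="http://example.com/">home</a>'
        '<tr><th>Download URL:</th>'
        '<td><a href="/files/pkg-1.0.tar.gz">source</a></td></tr>'
    )
    for link in find_external_links("https://pypi.org/pypi/pkg/1.0", page):
        print(link)
    # -> http://example.com/
    # -> https://pypi.org/files/pkg-1.0.tar.gz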
def process_url(self, url, retrieve=False):
    """Evaluate a URL as a possible download, and maybe retrieve it"""
    if os.getenv("CONDA_BUILD"):
        raise RuntimeError(
            "Setuptools downloading is disabled in conda build. "
            "Be sure to add all dependencies in the meta.yaml  url=%s" % url)
    if url in self.scanned_urls and not retrieve:
        return
    self.scanned_urls[url] = True
    if not URL_SCHEME(url):
        self.process_filename(url)
        return
    else:
        dists = list(distros_for_url(url))
        if dists:
            if not self.url_ok(url):
                return
            self.debug("Found link: %s", url)

    if dists or not retrieve or url in self.fetched_urls:
        list(map(self.add, dists))
        return  # don't need the actual page

    if not self.url_ok(url):
        self.fetched_urls[url] = True
        return

    self.info("Reading %s", url)
    self.fetched_urls[url] = True  # prevent multiple fetch attempts
    f = self.open_url(
        url,
        "Download error on %s: %%s -- Some packages may not be found!" % url)
    if f is None:
        return
    self.fetched_urls[f.url] = True
    if 'html' not in f.headers.get('content-type', '').lower():
        f.close()  # not html, we can't process it
        return

    base = f.url  # handle redirects
    page = f.read()
    if not isinstance(page, str):
        # We are in Python 3 and got bytes. We want str.
        if isinstance(f, HTTPError):
            # Errors have no charset, assume latin1:
            charset = 'latin-1'
        else:
            charset = f.headers.get_param('charset') or 'latin-1'
        page = page.decode(charset, "ignore")
    f.close()
    for match in HREF.finditer(page):
        link = urljoin(base, htmldecode(match.group(1)))
        self.process_url(link)
    if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
        page = self.process_index(url, page)
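# A hedged usage sketch (not from the original source): process_url() is a
# method of setuptools' PackageIndex, so it is normally driven through an
# instance; the index URL and project page below are placeholder values.
def _demo_process_url():
    from setuptools.package_index import PackageIndex
    index = PackageIndex(index_url="https://pypi.org/simple/")
    index.process_url("https://pypi.org/simple/example/", retrieve=True)
    # fetched_urls memoizes every page read above, so repeating the call
    # returns immediately instead of hitting the network again:
    index.process_url("https://pypi.org/simple/example/", retrieve=True)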
def process_index(self, url, page):
    """Process the contents of a PyPI page"""

    def scan(link):
        # Process a URL to see if it's for a package page
        if link.startswith(self.index_url):
            parts = list(map(
                unquote, link[len(self.index_url):].split('/')
            ))
            if len(parts) == 2 and '#' not in parts[1]:
                # it's a package page, sanitize and index it
                pkg = safe_name(parts[0])
                ver = safe_version(parts[1])
                self.package_pages.setdefault(pkg.lower(), {})[link] = True
                return to_filename(pkg), to_filename(ver)
        return None, None

    # process an index page into the package-page index
    for match in HREF.finditer(page):
        try:
            scan(urljoin(url, htmldecode(match.group(1))))
        except ValueError:
            pass

    pkg, ver = scan(url)  # ensure this page is in the page index
    if pkg:
        # process individual package page
        for new_url in find_external_links(url, page):
            # Process the found URL
            base, frag = egg_info_for_url(new_url)
            if base.endswith('.py') and not frag:
                if ver:
                    new_url += '#egg=%s-%s' % (pkg, ver)
                else:
                    self.need_version_info(url)
            self.scan_url(new_url)

        return PYPI_MD5.sub(
            lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page
        )
    else:
        return ""  # no sense double-scanning non-package pages
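# A worked sketch (made-up values, not from the original source) of scan()'s
# package-page test: a link directly under index_url with exactly two path
# segments and no fragment is indexed, using the same unquote, safe_name,
# safe_version, and to_filename helpers that scan() itself relies on.
def _demo_scan_parsing():
    index_url = "https://pypi.org/simple/"             # assumed value
    link = "https://pypi.org/simple/My_Package/1.0b1"  # hypothetical link
    parts = list(map(unquote, link[len(index_url):].split('/')))
    # exactly two path segments below index_url and no fragment marker:
    assert len(parts) == 2 and '#' not in parts[1]
    pkg, ver = safe_name(parts[0]), safe_version(parts[1])
    print(to_filename(pkg), to_filename(ver))          # -> My_Package 1.0b1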