def parse_index(self, disturl, html, scrape=True):
    p = HTMLPage(html, disturl.url)
    seen = set()
    for link in p.links:
        newurl = URL(link.url)
        if not newurl.is_valid_http_url():
            continue
        eggfragment = newurl.eggfragment
        if scrape and eggfragment:
            if normalize_name(eggfragment).startswith(self.projectname):
                # XXX seems we have to maintain a particular
                # order to keep pip/easy_install happy with some
                # packages (e.g. nose)
                if newurl not in self.egglinks:
                    self.egglinks.insert(0, newurl)
            else:
                log.debug("skip egg link %s (projectname: %s)",
                          newurl, self.projectname)
            continue
        if is_archive_of_project(newurl, self.projectname):
            if not newurl.is_valid_http_url():
                log.warn("unparseable/unsupported url: %r", newurl)
            else:
                seen.add(newurl.url)
                self._mergelink_ifbetter(newurl)
                continue
    if scrape:
        for link in p.rel_links():
            if link.url not in seen:
                disturl = URL(link.url)
                if disturl.is_valid_http_url():
                    self.crawllinks.add(disturl)
def _get_remote_projects(self): headers = {"Accept": "text/html"} # use a minimum of 30 seconds as timeout for remote server and # 60s when running as replica, because the list can be quite large # and the master might take a while to process it if self.xom.is_replica(): timeout = max(self.timeout, 60) else: timeout = max(self.timeout, 30) response = self.httpget( self.mirror_url, allow_redirects=True, extra_headers=headers, timeout=timeout) if response.status_code != 200: raise self.UpstreamError("URL %r returned %s %s", self.mirror_url, response.status_code, response.reason) page = HTMLPage(response.text, response.url) projects = set() baseurl = URL(response.url) basehost = baseurl.replace(path='') for link in page.links: newurl = URL(link.url) # remove trailing slashes, so basename works correctly newurl = newurl.asfile() if not newurl.is_valid_http_url(): continue if not newurl.path.startswith(baseurl.path): continue if basehost != newurl.replace(path=''): continue projects.add(newurl.basename) return projects
def parse_index(self, disturl, html):
    p = HTMLPage(html, disturl.url)
    seen = set()
    for link in p.links:
        newurl = Link(link.url, requires_python=link.requires_python)
        if not newurl.is_valid_http_url():
            continue
        if is_archive_of_project(newurl, self.project):
            if not newurl.is_valid_http_url():
                threadlog.warn("unparseable/unsupported url: %r", newurl)
            else:
                seen.add(newurl.url)
                self._mergelink_ifbetter(newurl)
                continue
def _get_remote_projects(self): headers = {"Accept": "text/html"} response = self.httpget(self.mirror_url, allow_redirects=True, extra_headers=headers) if response.status_code != 200: raise self.UpstreamError("URL %r returned %s", self.mirror_url, response.status_code) page = HTMLPage(response.text, response.url) projects = set() baseurl = URL(response.url) basehost = baseurl.replace(path='') for link in page.links: newurl = URL(link.url) # remove trailing slashes, so basename works correctly newurl = newurl.asfile() if not newurl.is_valid_http_url(): continue if not newurl.path.startswith(baseurl.path): continue if basehost != newurl.replace(path=''): continue projects.add(newurl.basename) return projects
def parselinks(htmlcontent, indexurl):
    from devpi_common.vendor._pip import HTMLPage
    page = HTMLPage(htmlcontent, indexurl)
    return list(page.links)
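# --- Hedged illustration, not part of the original source ---
# A minimal usage sketch of parselinks as defined above; the HTML snippet
# and index URL are made-up examples of a PEP 503 style simple page.
html = ('<html><body>'
        '<a href="pkg-1.0.tar.gz#sha256=deadbeef">pkg-1.0.tar.gz</a>'
        '</body></html>')
for link in parselinks(html, "https://pypi.org/simple/pkg/"):
    print(link.url)  # absolute URL of each anchor found on the page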