def parse_index(self, disturl, html, scrape=True):
    """Parse a simple-index HTML page for release links of this project.

    Archive links matching ``self.projectname`` are merged into the
    current result via ``self._mergelink_ifbetter``.  Egg-fragment links
    are collected into ``self.egglinks`` (prepended, see note below) and,
    when *scrape* is true, rel-links not already seen are queued for
    further crawling in ``self.crawllinks``.

    :param disturl: URL-like object the *html* was fetched from; its
        ``.url`` is used as the base for resolving relative links.
    :param html: raw HTML text of the index page.
    :param scrape: when true, honor egg fragments and follow rel-links;
        when false, only direct archive links are considered.
    """
    page = HTMLPage(html, disturl.url)
    seen = set()
    for link in page.links:
        newurl = URL(link.url)
        if not newurl.is_valid_http_url():
            continue
        eggfragment = newurl.eggfragment
        if scrape and eggfragment:
            if normalize_name(eggfragment).startswith(self.projectname):
                # XXX seems we have to maintain a particular
                # order to keep pip/easy_install happy with some
                # packages (e.g. nose)
                if newurl not in self.egglinks:
                    self.egglinks.insert(0, newurl)
            else:
                log.debug("skip egg link %s (projectname: %s)",
                          newurl, self.projectname)
            continue
        if is_archive_of_project(newurl, self.projectname):
            # NOTE(review): newurl already passed is_valid_http_url()
            # above (invalid URLs were skipped), so the original second
            # validity check and its warn branch were unreachable and
            # have been removed.
            seen.add(newurl.url)
            self._mergelink_ifbetter(newurl)
            continue
    if scrape:
        for link in page.rel_links():
            if link.url not in seen:
                # Use a fresh name instead of rebinding the `disturl`
                # parameter, as the original did.
                crawlurl = URL(link.url)
                if crawlurl.is_valid_http_url():
                    self.crawllinks.add(crawlurl)
def parse_index(self, disturl, html):
    """Parse a simple-index HTML page and merge matching release links.

    Each link is wrapped in a ``Link`` carrying its ``requires_python``
    metadata (PEP 503 ``data-requires-python``); links that are archives
    of ``self.project`` are merged into the current result via
    ``self._mergelink_ifbetter``.

    :param disturl: URL-like object the *html* was fetched from; its
        ``.url`` is used as the base for resolving relative links.
    :param html: raw HTML text of the index page.
    """
    page = HTMLPage(html, disturl.url)
    # Tracks URLs already merged; presumably consumed by code outside
    # this visible chunk in the original variant -- kept for parity.
    seen = set()
    for link in page.links:
        newurl = Link(link.url, requires_python=link.requires_python)
        if not newurl.is_valid_http_url():
            continue
        if is_archive_of_project(newurl, self.project):
            # NOTE(review): newurl already passed is_valid_http_url()
            # above, so the original re-check and its warn branch were
            # dead code and have been removed.
            seen.add(newurl.url)
            self._mergelink_ifbetter(newurl)