Example #1
 def parse_index(self, disturl, html, scrape=True):
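     # Walk every link on the HTML index page: queue matching egg links,
     # merge archive links for this project, and (when scraping) collect
     # the page's rel links as further URLs to crawl.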
     p = HTMLPage(html, disturl.url)
     seen = set()
     for link in p.links:
         newurl = URL(link.url)
         if not newurl.is_valid_http_url():
             continue
         eggfragment = newurl.eggfragment
         if scrape and eggfragment:
             if normalize_name(eggfragment).startswith(self.projectname):
                 # XXX seems we have to maintain a particular
                 # order to keep pip/easy_install happy with some
                 # packages (e.g. nose)
                 if newurl not in self.egglinks:
                     self.egglinks.insert(0, newurl)
             else:
                 log.debug("skip egg link %s (projectname: %s)",
                           newurl, self.projectname)
             continue
         if is_archive_of_project(newurl, self.projectname):
             if not newurl.is_valid_http_url():
                 log.warn("unparseable/unsupported url: %r", newurl)
             else:
                 seen.add(newurl.url)
                 self._mergelink_ifbetter(newurl)
                 continue
     if scrape:
         for link in p.rel_links():
             if link.url not in seen:
                 disturl = URL(link.url)
                 if disturl.is_valid_http_url():
                     self.crawllinks.add(disturl)
Example #2
 def parse_index(self, disturl, html):
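     # Variant without the scrape flag: wrap each link (keeping its
     # requires_python metadata) and merge archive links for the project.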
     p = HTMLPage(html, disturl.url)
     seen = set()
     for link in p.links:
         newurl = Link(link.url, requires_python=link.requires_python)
         if not newurl.is_valid_http_url():
             continue
         if is_archive_of_project(newurl, self.project):
             if not newurl.is_valid_http_url():
                 threadlog.warn("unparseable/unsupported url: %r", newurl)
             else:
                 seen.add(newurl.url)
                 self._mergelink_ifbetter(newurl)
                 continue
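Both examples lean on devpi-server internals that are not shown on this page (HTMLPage, URL, Link, normalize_name, is_archive_of_project, _mergelink_ifbetter, and the egglinks/crawllinks state). The sketch below is not devpi's implementation; it is a minimal, standard-library-only illustration of the same pattern: parse one index page, keep only valid http(s) links, treat links whose filename starts with the normalized project name as candidate archives, and keep everything else as crawl candidates. The helper names (_LinkCollector, is_valid_http_url, parse_index) and the filename-prefix heuristic are assumptions made for this illustration only.

 import re
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse


 def _normalize(name):
     # Rough stand-in for devpi's normalize_name(): lowercase and collapse
     # runs of ".", "-" and "_" into a single "-".
     return re.sub(r"[-_.]+", "-", name).lower()


 class _LinkCollector(HTMLParser):
     # Collect the href of every <a> tag, resolved against the page URL.
     def __init__(self, base_url):
         super().__init__()
         self.base_url = base_url
         self.hrefs = []

     def handle_starttag(self, tag, attrs):
         if tag == "a":
             for name, value in attrs:
                 if name == "href" and value:
                     self.hrefs.append(urljoin(self.base_url, value))


 def is_valid_http_url(url):
     # Accept only absolute http(s) URLs, mirroring the is_valid_http_url()
     # checks in the examples above.
     parsed = urlparse(url)
     return parsed.scheme in ("http", "https") and bool(parsed.netloc)


 def parse_index(page_url, html, project):
     # Simplified analogue of parse_index(): return (archives, crawl_links)
     # for one index page.  A link counts as an archive when its filename
     # starts with the normalized project name; every other valid link is
     # kept as a crawl candidate.
     collector = _LinkCollector(page_url)
     collector.feed(html)
     archives, crawl_links = [], []
     for href in collector.hrefs:
         if not is_valid_http_url(href):
             continue
         filename = urlparse(href).path.rsplit("/", 1)[-1]
         if _normalize(filename).startswith(_normalize(project)):
             archives.append(href)
         else:
             crawl_links.append(href)
     return archives, crawl_links


 if __name__ == "__main__":
     page = '<a href="/packages/nose-1.3.7.tar.gz">nose-1.3.7.tar.gz</a>'
     print(parse_index("https://pypi.org/simple/nose/", page, "nose"))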