Example #1
 def parse_index(self, disturl, html, scrape=True):
     p = HTMLPage(html, disturl.url)
     seen = set()
     for link in p.links:
         newurl = URL(link.url)
         if not newurl.is_valid_http_url():
             continue
         eggfragment = newurl.eggfragment
         if scrape and eggfragment:
             if normalize_name(eggfragment).startswith(self.projectname):
                 # XXX seems we have to maintain a particular
                 # order to keep pip/easy_install happy with some
                 # packages (e.g. nose)
                 if newurl not in self.egglinks:
                     self.egglinks.insert(0, newurl)
             else:
                 log.debug("skip egg link %s (projectname: %s)",
                           newurl, self.projectname)
             continue
         if is_archive_of_project(newurl, self.projectname):
             if not newurl.is_valid_http_url():
                 log.warn("unparseable/unsupported url: %r", newurl)
             else:
                 seen.add(newurl.url)
                 self._mergelink_ifbetter(newurl)
                 continue
     if scrape:
         for link in p.rel_links():
             if link.url not in seen:
                 disturl = URL(link.url)
                 if disturl.is_valid_http_url():
                     self.crawllinks.add(disturl)
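
The method first rejects anything that is not a well-formed http/https URL, then routes egg-fragment links and release archives separately. Below is a condensed, standalone sketch of that initial filtering step, assuming devpi_common is installed; valid_http_links and the sample URLs are illustrative, not part of devpi's API:

from devpi_common.url import URL
from devpi_common.vendor._pip import HTMLPage

def valid_http_links(html, indexurl):
    # Parse the index page and keep only links that resolve to
    # well-formed http/https URLs, mirroring the guard at the top
    # of parse_index above.
    page = HTMLPage(html, indexurl)
    urls = (URL(link.url) for link in page.links)
    return [u for u in urls if u.is_valid_http_url()]

html = '<a href="https://files.example.org/nose-1.3.7.tar.gz">nose</a>'
for url in valid_http_links(html, "https://pypi.example.org/simple/nose/"):
    print(url.url, url.basename)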
Example #2
 def _get_remote_projects(self):
     headers = {"Accept": "text/html"}
     # use a minimum of 30 seconds as timeout for remote server and
     # 60s when running as replica, because the list can be quite large
     # and the master might take a while to process it
     if self.xom.is_replica():
         timeout = max(self.timeout, 60)
     else:
         timeout = max(self.timeout, 30)
     response = self.httpget(
         self.mirror_url, allow_redirects=True, extra_headers=headers,
         timeout=timeout)
     if response.status_code != 200:
         raise self.UpstreamError("URL %r returned %s %s",
             self.mirror_url, response.status_code, response.reason)
     page = HTMLPage(response.text, response.url)
     projects = set()
     baseurl = URL(response.url)
     basehost = baseurl.replace(path='')
     for link in page.links:
         newurl = URL(link.url)
         # remove trailing slashes, so basename works correctly
         newurl = newurl.asfile()
         if not newurl.is_valid_http_url():
             continue
         if not newurl.path.startswith(baseurl.path):
             continue
         if basehost != newurl.replace(path=''):
             continue
         projects.add(newurl.basename)
     return projects
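
The final loop reduces each link to a project name: asfile() strips a trailing slash so basename returns the last path segment, and the two guards keep only links on the same host and below the index's base path. The same filtering extracted into a standalone sketch, where collect_projects and the sample URLs are hypothetical and URL comes from devpi_common:

from devpi_common.url import URL

def collect_projects(indexurl, hrefs):
    # Keep same-host links below the index root and use their
    # last path segment as the project name.
    baseurl = URL(indexurl)
    basehost = baseurl.replace(path='')
    projects = set()
    for href in hrefs:
        newurl = URL(href).asfile()  # drop trailing slash so basename works
        if not newurl.is_valid_http_url():
            continue
        if not newurl.path.startswith(baseurl.path):
            continue
        if basehost != newurl.replace(path=''):
            continue
        projects.add(newurl.basename)
    return projects

print(collect_projects(
    "https://pypi.example.org/simple/",
    ["https://pypi.example.org/simple/devpi-server/",
     "https://other.example.org/simple/elsewhere/"]))  # {'devpi-server'}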
Example #3
 def parse_index(self, disturl, html):
     p = HTMLPage(html, disturl.url)
     seen = set()
     for link in p.links:
         newurl = Link(link.url, requires_python=link.requires_python)
         if not newurl.is_valid_http_url():
             continue
         if is_archive_of_project(newurl, self.project):
             if not newurl.is_valid_http_url():
                 threadlog.warn("unparseable/unsupported url: %r", newurl)
             else:
                 seen.add(newurl.url)
                 self._mergelink_ifbetter(newurl)
                 continue
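
This variant also carries the data-requires-python metadata that PEP 503 index pages attach to each anchor. As a minimal hypothetical stand-in, a wrapper pairing a URL with that constraint could look like this (ProjectLink is illustrative, not devpi's actual Link class):

from dataclasses import dataclass
from typing import Optional

@dataclass
class ProjectLink:
    # A URL plus the optional data-requires-python value
    # scraped from the index page.
    url: str
    requires_python: Optional[str] = None

link = ProjectLink("https://files.example.org/pkg-2.0.tar.gz",
                   requires_python=">=3.7")
print(link.url, link.requires_python)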
Example #4
 def _get_remote_projects(self):
     headers = {"Accept": "text/html"}
     response = self.httpget(self.mirror_url, allow_redirects=True, extra_headers=headers)
     if response.status_code != 200:
         raise self.UpstreamError("URL %r returned %s",
                             self.mirror_url, response.status_code)
     page = HTMLPage(response.text, response.url)
     projects = set()
     baseurl = URL(response.url)
     basehost = baseurl.replace(path='')
     for link in page.links:
         newurl = URL(link.url)
         # remove trailing slashes, so basename works correctly
         newurl = newurl.asfile()
         if not newurl.is_valid_http_url():
             continue
         if not newurl.path.startswith(baseurl.path):
             continue
         if basehost != newurl.replace(path=''):
             continue
         projects.add(newurl.basename)
     return projects
Example #5
def parselinks(htmlcontent, indexurl):
    from devpi_common.vendor._pip import HTMLPage
    page = HTMLPage(htmlcontent, indexurl)
    return list(page.links)
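
A possible invocation, assuming devpi_common is installed; the HTML snippet and index URL are illustrative:

html = '''<html><body>
<a href="https://files.example.org/pkg-1.0.tar.gz">pkg-1.0.tar.gz</a>
<a href="../packages/pkg-0.9.zip">pkg-0.9.zip</a>
</body></html>'''

for link in parselinks(html, "https://pypi.example.org/simple/pkg/"):
    # HTMLPage resolves relative hrefs against the index URL
    print(link.url)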