Example #1
    def links(self):
        """Yields all links in the page"""

        # CHANGED from PIP original:
        # use HTMLParser instead of re
        # and store data-requires-python
        class AnchorParser(html_parser.HTMLParser, object):
            def __init__(self, *args, **kwargs):
                super(AnchorParser, self).__init__(*args, **kwargs)
                self.anchors = []

            def handle_starttag(self, tag, attrs):
                if tag != 'a':
                    return

                for key, value in attrs:
                    if key == 'href':
                        self.anchors.append(dict(attrs))
                        break

        parser = AnchorParser()
        parser.feed(self.content)
        parser.close()

        for anchor in parser.anchors:
            url = anchor['href']

            # CHANGED from PIP original: catch parsing errors
            try:
                url = self.clean_link(urljoin(self.base_url, url))
            except ValueError:
                continue

            pyrequire = anchor.get('data-requires-python')
            yield Link(url, self, requires_python=pyrequire)
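
The HTMLParser-based variant above is easy to try in isolation. The sketch below is a minimal, self-contained Python 3 version of the same anchor-collecting technique; the class name, sample HTML and printed fields are illustrative only, not part of the devpi code:

    from html.parser import HTMLParser

    class AnchorCollector(HTMLParser):
        """Collects the attribute dict of every <a> tag that has an href."""

        def __init__(self):
            super().__init__()
            self.anchors = []

        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            attrs = dict(attrs)
            if 'href' in attrs:
                self.anchors.append(attrs)

    page = '<a href="pkg-1.0.tar.gz" data-requires-python="&gt;=3.6">pkg 1.0</a>'
    parser = AnchorCollector()
    parser.feed(page)
    parser.close()
    for anchor in parser.anchors:
        # HTMLParser unescapes entity references in attribute values,
        # so this prints: pkg-1.0.tar.gz >=3.6
        print(anchor['href'], anchor.get('data-requires-python'))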
Example #2
File: _pip.py Project: kaiix/devpi
 def links(self):
     """Yields all links in the page"""
     for match in self._href_re.finditer(self.content):
         url = match.group(1) or match.group(2) or match.group(3)
         # CHANGED from PIP original: catch parsing errors
         try:
             url = self.clean_link(urljoin(self.base_url, url))
         except ValueError:
             continue
         yield Link(url, self)
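
`_href_re` is defined elsewhere on the page class; the `group(1) or group(2) or group(3)` chain implies it captures double-quoted, single-quoted and unquoted href values in three alternative groups. A rough reconstruction, modelled on the old pip pattern rather than copied from devpi:

    import re

    # Assumed pattern: group 1 = double-quoted value, group 2 = single-quoted,
    # group 3 = unquoted -- hence `group(1) or group(2) or group(3)` above.
    _href_re = re.compile(r'href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))',
                          re.IGNORECASE | re.MULTILINE)

    m = _href_re.search('<a href="../simple/pkg/">pkg</a>')
    print(m.group(1) or m.group(2) or m.group(3))  # ../simple/pkg/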
Example #3
 def scraped_rel_links(self):
     """Yields homepage and download links scraped from the page body"""
     for regex in (self._homepage_re, self._download_re):
         match = regex.search(self.content)
         if not match:
             continue
         # take the first href that appears after the matched heading
         href_match = self._href_re.search(self.content, pos=match.end())
         if not href_match:
             continue
         url = href_match.group(1) or href_match.group(2) or href_match.group(3)
         if not url:
             continue
         url = self.clean_link(urljoin(self.base_url, url))
         yield Link(url, self)
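
`_homepage_re` and `_download_re` are likewise defined elsewhere; the method scans for those headings and then takes the first href that follows each match. The sketch below reproduces that scan-then-search-forward idea with assumed patterns (the regexes and the sample markup here are illustrative):

    import re

    # Assumed heading and href patterns, modelled on old pip
    _homepage_re = re.compile(r'<th>\s*home\s*page', re.I)
    _href_re = re.compile(r'href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))', re.I)

    content = '<th>Home Page:</th><td><a href="https://example.org/pkg">pkg</a></td>'
    heading = _homepage_re.search(content)
    if heading:
        # search forward from the end of the heading match for the first href
        href = _href_re.search(content, heading.end())
        if href:
            print(href.group(1) or href.group(2) or href.group(3))
            # https://example.org/pkg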
Example #4
File: _pip.py Project: kaiix/devpi
 def explicit_rel_links(self, rels=('homepage', 'download')):
     """Yields all links with the given relations"""
     for match in self._rel_re.finditer(self.content):
         found_rels = match.group(1).lower().split()
         for rel in rels:
             if rel in found_rels:
                 break
         else:
             # for/else: none of the requested rels matched this tag, skip it
             continue
         # look for the href inside the same tag that carried the rel attribute
         match = self._href_re.search(match.group(0))
         if not match:
             continue
         url = match.group(1) or match.group(2) or match.group(3)
         url = self.clean_link(urljoin(self.base_url, url))
         yield Link(url, self)
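
`_rel_re` is expected to match a tag carrying a rel attribute, with the rel value in group 1 and the matched tag text in group 0, which is why the href can then be re-searched inside `match.group(0)`. An assumed sketch of such a pattern, again modelled on old pip rather than taken from devpi:

    import re

    # Assumed pattern: group(0) spans the tag text up to the rel value,
    # group(1) captures the space-separated rel token(s).
    _rel_re = re.compile(r"""<[^>]*\srel\s*=\s*['"]?([^'">]+)['"]?""", re.I)

    tag = '<a href="https://example.org/pkg-1.0.zip" rel="download">'
    match = _rel_re.search(tag)
    print(match.group(1).lower().split())  # ['download']
    # group(0) still contains the href attribute, so a follow-up
    # _href_re.search(match.group(0)) can pull the URL out of the same tag
    print('href=' in match.group(0))       # True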