Example #1
0
    def handle_endtag(self, tag):
        """Handle an end tag while a link is being collected.

        Nothing happens when no link is open (``self._link_counter`` is
        ``None``).  While the counter is at least 1 it is simply
        decremented (presumably it tracks tags nested inside the link --
        confirm against the start-tag handler).  Otherwise the link is
        finalized: its ``href`` is resolved against ``self._base_uri`` /
        ``self._relative_path``, a ``(href, filename, name, rel)`` tuple is
        appended to ``self.links``, and the per-link state is reset.

        :param tag: name of the tag being closed; a warning is logged if
            the tag that closes the link is not ``a``.
        """
        if self._link_counter is None:
            return
        if self._link_counter >= 1:
            self._link_counter -= 1
            return

        if tag != 'a':
            logger.warning(u'Invalid HTML tags in %s', self._url)

        href = self._get_link_attr('href')
        # Anchors and empty hrefs are discarded.
        if href and href[0] != '#':
            href_parts = urlparse.urlparse(href)
            if not href_parts[0] and href_parts[1]:
                # Network-path reference ("//host/path"): it carries its
                # own host, so only the scheme is taken from the base URI.
                # Without this check it would match the leading-slash case
                # below and be re-rooted on the base host.
                href = urlparse.urlunparse(
                    (self._base_uri[0],) + href_parts[1:])
            elif href[0] == '/':
                # Convert absolute URL (path only) to absolute URI.
                href = urlparse.urlunparse(
                    self._base_uri + href_parts[2:])
            elif not is_remote_uri(href):
                # Relative URL: prefix the path with the current
                # relative path and keep params, query and fragment.
                resolved_path = '/'.join(
                    (self._relative_path, href_parts[2]))
                href = urlparse.urlunparse(
                    self._base_uri + (resolved_path,) + href_parts[3:])

            # The filename always comes from the path of the href as it
            # was written in the document, not from the resolved URI.
            filename = os.path.basename(href_parts[2])
            # If the content of the link is empty, the last part of the
            # path is used as its name.
            name = ' '.join(self._buffer) if self._buffer else filename
            rel = self._get_link_attr('rel')
            self.links.append((href, filename, name, rel))

        # The link is closed: drop all per-link state.
        self._link_counter = None
        self._link_attrs = None
        self._buffer = None
Example #2
0
 def __call__(self, uri):
     """Return ``self.download(uri)`` for a remote URI, else ``uri`` itself.

     Whether ``uri`` counts as remote is decided by ``is_remote_uri``.
     """
     return self.download(uri) if is_remote_uri(uri) else uri