def handle_endtag(self, tag):
    """Handle a closing tag and record the current link once it ends.

    Nothing happens while ``self._link_counter`` is ``None``.  While the
    counter is 1 or more, it is simply decremented.  Otherwise the link is
    considered finished: its ``href`` is normalised, a
    ``(href, filename, name, rel)`` tuple is appended to ``self.links``
    (unless the href is empty or an in-page anchor), and the per-link
    state (``_link_counter``, ``_link_attrs``, ``_buffer``) is reset to
    ``None``.

    :param tag: name of the tag being closed.
    """
    # NOTE(review): the counter presumably tracks tags opened inside the
    # current <a> element -- confirm against handle_starttag.
    if self._link_counter is None:
        return

    if self._link_counter >= 1:
        # A tag closed while the link is still open.
        self._link_counter -= 1
        return

    if tag != 'a':
        # The tag that ends the link should be </a>; anything else is
        # reported but still treated as the end of the link.
        logger.warning(u'Invalid HTML tags in %s', self._url)

    href = self._get_link_attr('href')
    # We discard anchors and empty href.
    if href and href[0] != '#':
        href_parts = urlparse.urlparse(href)
        if href[0] == '/':
            # Convert absolute URL to absolute URI: keep path, params,
            # query and fragment, and prefix them with the base URI parts.
            href = urlparse.urlunparse(self._base_uri + href_parts[2:])
        elif not is_remote_uri(href):
            # Handle relative URL: resolve the path against the current
            # relative path before prefixing the base URI parts.
            resolved_path = '/'.join((self._relative_path, href_parts[2]))
            href = urlparse.urlunparse(
                self._base_uri + (resolved_path,) + href_parts[3:])

        # The filename always comes from the path of the *original* href.
        filename = os.path.basename(href_parts[2])
        # If the content of the link is empty, we use the last part of
        # the path as its display name.
        name = ' '.join(self._buffer) if self._buffer else filename
        rel = self._get_link_attr('rel')
        self.links.append((href, filename, name, rel))

    # The link is closed (recorded or discarded): reset per-link state.
    self._link_counter = None
    self._link_attrs = None
    self._buffer = None
def __call__(self, uri):
    """Return ``uri`` itself, or the result of downloading it when remote.

    :param uri: location to resolve.
    :returns: ``self.download(uri)`` when ``is_remote_uri(uri)`` is true,
        otherwise ``uri`` unchanged.
    """
    return self.download(uri) if is_remote_uri(uri) else uri