def run(self):
    """Fetch the page at ``self.url``, collect links matching ``self.regex``,
    and download each one into ``self.out_dir``.

    Behavior:
      * Links are made absolute (resolving ``<base href>``) before matching.
      * A link whose target filename already exists in ``self.out_dir`` is
        skipped, so re-runs are incremental.
      * Downloads go to ``<name>.tmp`` first and are renamed into place only
        on success, so an interrupted run never leaves a truncated file that
        would be mistaken for a finished one.
      * When ``self.dry_run`` is true, everything is logged but nothing is
        downloaded.
    """
    _LOG.info("Fetch links from url: %s", self.url)
    header = {
        # most common user-agent
        'user-agent': 'Mozilla/5.0 ;Windows NT 6.1; WOW64; Trident/7.0; rv:11.0; like Gecko',
    }
    # Explicit timeout: without one, requests.get can block forever on an
    # unresponsive server.
    page = requests.get(self.url, headers=header, timeout=60)
    tree = html.fromstring(page.text)
    tree.make_links_absolute(self.url, resolve_base_href=True)
    # Keep (element, link) pairs so the anchor text is available for logging.
    elements_links = []
    for element, attribute, link, pos in tree.iterlinks():
        if self.regex.search(link):
            elements_links.append((element, link))
        else:
            _LOG.debug("Skip [%s]", link)
    num_links = len(elements_links)
    for i, (element, link) in enumerate(elements_links):
        # NOTE: every entry here already passed self.regex above, so the
        # original redundant re-check of the pattern has been removed.
        _LOG.info(
            "(%s/%s) %s (%s)",
            i + 1,
            num_links,
            element.text,
            link,
        )
        # Decode %xx escapes so the local filename is human-readable.
        new_filepath = os.path.join(
            self.out_dir,
            webutil.unquote_url(os.path.basename(link)),
        )
        if os.path.exists(new_filepath):
            _LOG.info(
                "Skip [%s] since [%s] already exists",
                link,
                new_filepath,
            )
            continue
        _LOG.info("Download to [%s]", new_filepath)
        if not self.dry_run:
            # Download to a temp name, then atomically rename into place.
            tmp_filepath = new_filepath + ".tmp"
            webutil.download_file(link, tmp_filepath)
            os.rename(tmp_filepath, new_filepath)
def test_unquote_url_OneEncodedCharacter_ReturnUnquoted(self):
    """A single percent-escape (%20) is decoded back to its character."""
    decoded = webutil.unquote_url('http://abc.com/hello%20world')
    self.assertEqual('http://abc.com/hello world', decoded)
def test_unquote_url_NoSpecialCharacter_ReturnAsIs(self):
    """A URL with no percent-escapes passes through unchanged."""
    plain_url = 'http://abc.com'
    self.assertEqual(plain_url, webutil.unquote_url(plain_url))
def test_unquote_url_EmptyString(self):
    """The empty string is a valid input and unquotes to itself."""
    result = webutil.unquote_url('')
    self.assertEqual('', result)