def extract_urls(text, baseurl=None):
    """Extract the absolute urls of all ``<a href>`` links found in HTML text.

    Args:
        text: HTML markup to be extracted from. Empty, ``None`` or
            whitespace-only input yields an empty list.
        baseurl: base url reference for translating urls. When the document
            carries a ``<base href>``, that value (resolved against this
            argument through ``absurl``) replaces it.

    Returns:
        a list of urls as produced by ``absurl``; hrefs that are empty, or
        for which ``absurl`` returns a falsy value, are left out.
    """
    # lxml.html.fromstring() raises a ParserError ("Document is empty") when
    # there is nothing to parse; such input simply contains no links.
    if not text or not text.strip():
        return []

    tree = lxml.html.fromstring(text)

    bases = tree.xpath("//base/@href")
    if bases:
        # The last <base href> wins and is itself resolved against the
        # caller-supplied baseurl.
        # NOTE(review): the HTML standard honours the *first* <base href>;
        # last-wins is kept here to preserve existing behaviour - confirm.
        baseurl = absurl(bases[-1], baseurl)

    # Translate every non-empty href to an absolute url, then drop the ones
    # absurl could not resolve.
    resolved = (absurl(href, baseurl) for href in tree.xpath("//a/@href") if href)
    return [url for url in resolved if url]
def test_absurl_relative(self):
    """Check absurl on relative urls with missing, valid and invalid base references."""
    # Each entry: (positional arguments passed to absurl, expected result).
    cases = [
        # baseref is None
        (("www.example.com",), None),
        (("/index.html",), None),
        # SPECIAL: a scheme-relative url is expected to resolve without a baseref
        (("//example.com",), "http://example.com/"),
        # good baseref
        (
            ("www.example.com", "http://example.com/index.html"),
            "http://example.com/www.example.com",
        ),
        (
            ("/index.html", "http://example.com/help.html"),
            "http://example.com/index.html",
        ),
        # bad baseref
        (("/index.html", "www.example.com"), None),
        (("/index.html", "/index.html"), None),
    ]

    for args, expected in cases:
        actual = absurl(*args)
        if expected is None:
            assert actual is None
        else:
            assert actual == expected
def test_absurl_absolute(self):
    """Check absurl normalisation and scheme filtering on absolute urls."""
    # Each entry: (url passed to absurl, expected result).
    cases = [
        # already normalised: expected back unchanged
        ("http://example.com/index.html?q=1", "http://example.com/index.html?q=1"),
        # path-absent: "/" is expected before the query
        ("http://example.com?q=1", "http://example.com/?q=1"),
        # non-lower: scheme and host are expected lowercased, path and query kept
        ("HTTP://EXAMPLE.COM/INDEX.HTML?Q=1", "http://example.com/INDEX.HTML?Q=1"),
        # remove-fragment
        ("http://example.com/?q=1#footer", "http://example.com/?q=1"),
        # schemes: https and ftp urls are expected back unchanged,
        # any other scheme is expected to give None
        ("https://example.com/", "https://example.com/"),
        ("ftp://*****:*****@example.com/", "ftp://*****:*****@example.com/"),
        ("httpx://[email protected]/", None),
    ]

    for url, expected in cases:
        actual = absurl(url)
        if expected is None:
            assert actual is None
        else:
            assert actual == expected