コード例 #1
0
ファイル: contentprocessor.py プロジェクト: 7c00/minispider
def extract_urls(text, baseurl=None):
    """Extract the absolute link targets of every ``<a href>`` in text.

    Args:
        text: HTML markup to be scanned
        baseurl: base url reference for translating relative urls; a usable
            ``<base href>`` inside the document takes precedence over it

    Returns:
        a list of urls as produced by absurl, in the order the XPath query
        yields the links; empty hrefs and hrefs for which absurl returns a
        falsy value are left out
    """
    # lxml refuses to parse an empty document (it raises instead of
    # returning a tree); a blank page simply contains no links.
    if not text or not text.strip():
        return []

    tree = lxml.html.fromstring(text)

    # The HTML standard honours only the first <base> element carrying an
    # href; later ones are ignored. If that href cannot be turned into an
    # absolute url, keep the caller-supplied base instead of discarding it.
    bases = tree.xpath("//base/@href")
    if bases:
        baseurl = absurl(bases[0], baseurl) or baseurl

    # Translate each non-empty href and drop the ones absurl rejects.
    resolved = (absurl(href, baseurl) for href in tree.xpath("//a/@href") if href)
    return [url for url in resolved if url]
コード例 #2
0
ファイル: test_urltools.py プロジェクト: 7c00/minispider
 def test_absurl_relative(self):
     """Check absurl on references that are not absolute urls themselves.

     Each case pairs the positional arguments for absurl with the expected
     result; None means the reference is expected to be rejected.
     """
     cases = [
         # no base reference supplied
         (("www.example.com",), None),
         (("/index.html",), None),
         # scheme-relative input is still expected to resolve
         (("//example.com",), "http://example.com/"),
         # usable base reference
         (
             ("www.example.com", "http://example.com/index.html"),
             "http://example.com/www.example.com",
         ),
         (
             ("/index.html", "http://example.com/help.html"),
             "http://example.com/index.html",
         ),
         # base reference that is not an absolute url
         (("/index.html", "www.example.com"), None),
         (("/index.html", "/index.html"), None),
     ]
     for args, expected in cases:
         actual = absurl(*args)
         if expected is None:
             assert actual is None
         else:
             assert actual == expected
コード例 #3
0
ファイル: test_urltools.py プロジェクト: 7c00/minispider
 def test_absurl_absolute(self):
     """Check absurl on inputs that already carry a scheme.

     Each case pairs the input url with the expected result; None means the
     url is expected to be rejected.
     """
     cases = [
         # already in the expected form
         ("http://example.com/index.html?q=1", "http://example.com/index.html?q=1"),
         # missing path gets a "/"
         ("http://example.com?q=1", "http://example.com/?q=1"),
         # scheme and host are lower-cased, path and query are kept as-is
         ("HTTP://EXAMPLE.COM/INDEX.HTML?Q=1", "http://example.com/INDEX.HTML?Q=1"),
         # fragment is stripped
         ("http://example.com/?q=1#footer", "http://example.com/?q=1"),
         # https and ftp inputs are expected back unchanged
         ("https://example.com/", "https://example.com/"),
         ("ftp://*****:*****@example.com/", "ftp://*****:*****@example.com/"),
         # any other scheme is expected to be rejected
         ("httpx://[email protected]/", None),
     ]
     for url, expected in cases:
         actual = absurl(url)
         if expected is None:
             assert actual is None
         else:
             assert actual == expected