def test_html_page_should_be_able_to_scrape_rel_links():
    """
    Test scraping page looking for url in href

    NOTE(review): this test originally had the exact same name as the test
    defined immediately after it, so this first definition was shadowed at
    module level and never collected by pytest. Renamed (also fixing the
    "scrap" -> "scrape" typo) so both tests actually run.
    """
    page = HTMLPage("""
        <!-- The <th> elements below are a terrible terrible hack for setuptools -->
        <li>
          <strong>Home Page:</strong>
          <!-- <th>Home Page -->
          <a href="http://supervisord.org/">http://supervisord.org/</a>
        </li>""", "supervisor")
    # scraped_rel_links() should find the single href in the snippet above.
    links = list(page.scraped_rel_links())
    assert len(links) == 1
    assert links[0].url == 'http://supervisord.org/'
def test_html_page_should_be_able_to_scrap_rel_links():
    """
    Test scraping page looking for url in href
    """
    snippet = """
        <!-- The <th> elements below are a terrible terrible hack for setuptools -->
        <li>
          <strong>Home Page:</strong>
          <!-- <th>Home Page -->
          <a href="http://supervisord.org/">http://supervisord.org/</a>
        </li>"""
    page = HTMLPage(snippet, "supervisor")
    # Exactly one anchor is present, so exactly one scraped link is expected.
    scraped = list(page.scraped_rel_links())
    assert len(scraped) == 1
    assert scraped[0].url == 'http://supervisord.org/'
def test_html_page_should_be_able_to_filter_links_by_rel():
    """
    Test selecting links by the rel attribute
    """
    page = HTMLPage("""
        <a href="http://example.com/page.html">Some page</a>
        <a href="http://example.com/archive-1.2.3.tar.gz" rel="download">Download URL</a>
        <a href="http://example.com/home.html" rel="homepage">Homepage</a>
        """, "archive")
    # With no filter, every anchor carrying any rel attribute is returned.
    rel_links = list(page.rel_links())
    rel_urls = [link.url for link in rel_links]
    assert len(rel_links) == 2
    assert 'http://example.com/archive-1.2.3.tar.gz' in rel_urls
    assert 'http://example.com/home.html' in rel_urls
    # Filtering by a specific rel value narrows the result to matching anchors.
    homepage_links = list(page.rel_links(('homepage',)))
    assert len(homepage_links) == 1
    assert homepage_links[0].url == 'http://example.com/home.html'
    download_links = list(page.rel_links(('download',)))
    assert len(download_links) == 1
    assert download_links[0].url == 'http://example.com/archive-1.2.3.tar.gz'
def test_base_url(html, url, expected):
    """
    Test that HTMLPage.base_url is derived as expected from the page
    content and its URL.

    NOTE(review): the (html, url, expected) arguments are presumably
    supplied by a ``@pytest.mark.parametrize`` decorator or fixtures not
    visible in this chunk — confirm against the surrounding file.
    """
    page = HTMLPage(html, url)
    assert page.base_url == expected