def test_recursively_finds_http_links(self, mock_get):
    """Check that a depth-2 crawl returns every link from all four mocked pages."""
    link_groups = (
        ["http://cs.ucla.edu/a", "http://cs.ucla.edu/b", "http://cs.ucla.edu/c"],
        ["http://cs.ucla.edu/d", "http://cs.ucla.edu/e", "http://cs.ucla.edu/f"],
        ["http://cs.ucla.edu/g", "http://cs.ucla.edu/h", "http://cs.ucla.edu/i"],
        ["http://cs.ucla.edu/j", "http://cs.ucla.edu/k", "http://cs.ucla.edu/l"],
    )
    pages = [make_page_links(*group) for group in link_groups]

    # A PropertyMock must live on the mock's type; with an iterable
    # side_effect, each read of ``.text`` yields the next page in turn.
    response_type = type(mock_get.return_value)
    response_type.text = mock.PropertyMock(side_effect=pages)

    found = get_page_links("http://cs.ucla.edu", max_depth=2)

    expected = [link for group in link_groups for link in group]
    assert len(found) == len(expected)
    assert set(found) == set(expected)
def test_returns_unique_links_in_order(self, mock_get):
    """Check that links repeated on a page are returned once, in page order."""
    expected = [
        "http://cs.ucla.edu/a",
        "http://cs.ucla.edu/b",
        "http://cs.ucla.edu/c",
    ]
    # The mocked page lists the same three links three times over.
    mock_get.return_value.text = make_page_links(*(expected * 3))

    found = get_page_links("http://cs.ucla.edu")

    assert found == expected
def test_ignores_links_to_another_domain(self, mock_get):
    """Check that only links on the crawled host (cs.ucla.edu) are returned."""
    same_host = [
        "http://cs.ucla.edu/a",
        "http://cs.ucla.edu/b",
        "http://cs.ucla.edu/c",
    ]
    # Parent domain, sibling subdomain and an unrelated domain.
    other_hosts = [
        "http://ucla.edu",
        "http://seas.ucla.edu",
        "http://stanford.edu",
    ]
    mock_get.return_value.text = make_page_links(*(same_host + other_hosts))

    found = get_page_links("http://cs.ucla.edu")

    assert found == same_host
def test_ignores_non_http_links(self, mock_get):
    """Check that ftp:, rss: and data: links are left out of the result."""
    expected = [
        "http://cs.ucla.edu/a",
        "http://cs.ucla.edu/b",
        "http://cs.ucla.edu/c",
    ]
    unwanted = [
        "ftp://b.com/test",
        "rss://a/feed",
        "data:image/gif;base64,R0lGODlhEAAJAIAAAP///wAAACH5BAEAAAAALAAAAAAQAAkAAAIKhI+py+0Po5yUFQA7",
    ]
    mock_get.return_value.text = make_page_links(*(expected + unwanted))

    found = get_page_links("http://cs.ucla.edu")

    assert found == expected
def test_returns_empty_if_request_status_not_200(self, mock_get):
    """Check that a RequestException from raise_for_status() gives an empty result."""
    response = mock_get.return_value
    response.raise_for_status.side_effect = requests.exceptions.RequestException()

    assert not get_page_links("http://cs.ucla.edu")
def test_returns_empty_array_when_max_depth_reached(self):
    """Check that a crawl with max_depth=0 yields no links."""
    assert not get_page_links("http://cs.ucla.edu", max_depth=0)
def test_returns_empty_if_invalid_html_received(self, mock_get):
    """Check that malformed markup (unclosed <body>) yields no links."""
    broken_markup = "<html><body></html>"
    mock_get.return_value.text = broken_markup

    assert not get_page_links("http://cs.ucla.edu")
def test_raises_if_nonrequest_related_exception_occurs(self, mock_get):
    """Check that a RuntimeError raised by the mocked call propagates to the caller."""
    mock_get.side_effect = RuntimeError()

    with pytest.raises(RuntimeError):
        get_page_links("http://cs.ucla.edu")
def test_returns_empty_if_request_fails(self, mock_get):
    """Check that a RequestException raised by the mocked call gives an empty result."""
    mock_get.side_effect = requests.exceptions.RequestException()

    assert not get_page_links("http://cs.ucla.edu")