def test_oneimage_page():
    """The one-image fixture yields no links and exactly one static file."""
    crawler = CrawlitSpider()
    response = mock_response_from_file("html/oneimage.html")
    parsed = list(crawler.parse(response))
    first_item = parsed[0]
    # No links of either kind are expected for this fixture.
    assert len(first_item["internal_links"]) == 0
    assert len(first_item["external_links"]) == 0
    static_files = first_item["static_content"]
    assert len(static_files) == 1
    assert static_files[0] == "monalisa.jpg"
def test_html5_vid_page():
    """The HTML5 video fixture yields no links and two video sources."""
    crawler = CrawlitSpider()
    response = mock_response_from_file("html/html5video.html")
    parsed = list(crawler.parse(response))
    first_item = parsed[0]
    assert len(first_item["internal_links"]) == 0
    assert len(first_item["external_links"]) == 0
    static_files = first_item["static_content"]
    assert len(static_files) == 2
    # Both sources must be reported; their order is not checked.
    for expected_name in ("movie.mp4", "movie.ogg"):
        assert expected_name in static_files
def test_one_internal_link_page():
    """A single internal link is recorded on the item and a Request follows."""
    crawler = CrawlitSpider()
    crawler.allowed_domains = ["www.example.com"]
    response = mock_response_from_file("html/oneinternal.html")
    output = list(crawler.parse(response))
    item = output[0]
    assert isinstance(item, CrawlitItem)
    assert len(item["external_links"]) == 0
    found_internal = item["internal_links"]
    assert found_internal == ["about_us.html"]
    # The second yielded object must be a Request.
    assert isinstance(output[1], Request)
def test_onelocationhash_within_page():
    """A location-hash link is listed as internal and nothing else is yielded."""
    crawler = CrawlitSpider()
    crawler.allowed_domains = ["www.example.com"]
    response = mock_response_from_file("html/onelocationhash.html")
    output = list(crawler.parse(response))
    # Only the item itself is expected in the output.
    assert len(output) == 1
    item = output[0]
    assert isinstance(item, CrawlitItem)
    assert len(item["external_links"]) == 0
    found_internal = item["internal_links"]
    assert found_internal == ["#about_us"]
def test_one_external_link_page():
    """A single external link is recorded on the item and nothing else is yielded."""
    crawler = CrawlitSpider()
    crawler.allowed_domains = ["www.example.com"]
    response = mock_response_from_file("html/oneexternal.html")
    output = list(crawler.parse(response))
    # Only the item itself is expected in the output.
    assert len(output) == 1
    item = output[0]
    assert isinstance(item, CrawlitItem)
    expected_external = ["http://www.interesting.com/interesting_page.html"]
    found_external = item["external_links"]
    assert found_external == expected_external
    assert len(item["internal_links"]) == 0
def test_linkless_page():
    """The linkless fixture yields exactly one item with no links."""
    crawler = CrawlitSpider()
    response = mock_response_from_file("html/linkless.html")
    parsed = list(crawler.parse(response))
    assert len(parsed) == 1
    only_item = parsed[0]
    assert len(only_item["internal_links"]) == 0
    assert len(only_item["external_links"]) == 0
def test_remove_invalid_javascript_call_links():
    """remove_invalid_links returns an empty list for ``javascript:`` hrefs."""
    crawler = CrawlitSpider()
    # Two variants of the same pseudo-link, with and without a trailing semicolon.
    javascript_hrefs = ["javascript:void(0)", "javascript:void(0);"]
    remaining = crawler.remove_invalid_links(javascript_hrefs)
    assert remaining == []