@respx.mock
async def test_should_work_with_http_url(self, page_content, tmp_path, anyio_backend):
    url = 'http://quotes.com'
    # robots.txt answers 404, so every page of the site is allowed
    respx.get(url, path='/robots.txt') % 404
    respx.get(url, path='/') % {'html': page_content('page1.html')}
    for i in range(2, 4):
        respx.get(url, path=f'/page{i}.html') % {'html': page_content(f'page{i}.html')}

    backup_path = tmp_path / 'backup.mp'
    config = Configuration(
        item_processors=[self.processor],
        backup_filename=f'{backup_path}',
        follow_robots_txt=True,
    )
    static_spider = StaticSpider(urls=[url], parse=self.parse, config=config)
    await static_spider.run()
    stats = static_spider.statistics()

    followed_urls = {f'{url}/page{i}.html' for i in range(2, 4)}
    assert stats.reachable_urls == {url} | followed_urls
    assert stats.followed_urls == followed_urls
    assert stats.request_counter == 3
    assert stats.average_fetch_time > 0
    await self.common_assert(stats, backup_path)
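
# The `page_content` fixture used above is defined elsewhere in the test
# suite (it belongs in conftest.py, not in this class). A minimal sketch of
# what it is assumed to do, namely return the raw HTML of a sample page by
# file name; the `samples` directory is an assumption, not taken from this
# file:

import pathlib

import pytest


@pytest.fixture()
def page_content():
    def _read(filename: str) -> str:
        # read a sample HTML file shipped next to the tests
        return (pathlib.Path(__file__).parent / 'samples' / filename).read_text()
    return _read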
async def test_should_work_with_file_url(self, page_1_file_url, tmp_path):
    backup_path = tmp_path / 'backup.mp'
    config = Configuration(item_processors=[self.processor], backup_filename=f'{backup_path}')
    static_spider = StaticSpider(urls=[page_1_file_url], parse=self.parse, config=config)
    await static_spider.run()
    stats = static_spider.statistics()

    # page1 links to page2 and page3; the spider reports followed file URLs
    # with 'file:///' collapsed to 'file:/'
    followed_urls = {
        page_1_file_url.replace('1', '2').replace('///', '/'),
        page_1_file_url.replace('1', '3').replace('///', '/'),
    }
    assert stats.reachable_urls == {page_1_file_url} | followed_urls
    assert stats.followed_urls == followed_urls
    assert stats.request_counter == 3
    assert stats.average_fetch_time == static_spider._total_fetch_time / stats.request_counter
    await self.common_assert(stats, backup_path)
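
# `page_1_file_url` is likewise assumed to be a conftest.py fixture exposing a
# file:// URL to a local sample page, which is why the test can derive the
# page2/page3 URLs with str.replace('1', ...). A minimal sketch under that
# assumption (it relies on the rest of the samples path containing no other
# '1' character):

import pathlib

import pytest


@pytest.fixture()
def page_1_file_url() -> str:
    # file:// URL to page1.html, which links to page2.html and page3.html
    return (pathlib.Path(__file__).parent / 'samples' / 'page1.html').resolve().as_uri()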
@respx.mock
async def test_should_return_correct_statistics_after_running_spider(self, anyio_backend):
    url1 = 'http://foo.com'
    respx.get(url1, path='/')
    respx.get(url1, path='/robots.txt') % 404

    async def parse(*_) -> None:
        pass

    static_spider = StaticSpider(urls=[url1], parse=parse, config=Configuration(follow_robots_txt=True))
    await static_spider.run()
    stats = static_spider.statistics()

    # only the landing page is fetched: the robots.txt probe is not counted
    assert stats.reachable_urls == {url1}
    assert stats.unreachable_urls == set()
    assert stats.followed_urls == set()
    assert stats.robot_excluded_urls == set()
    assert stats.request_counter == 1
    assert stats.total_time > 0
    assert stats.average_fetch_time > 0
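
# For quick manual runs outside pytest, the spider can be exercised directly;
# a minimal sketch, assuming the import paths implied by these tests (the
# example URL and the no-op parse callback are placeholders):

if __name__ == '__main__':
    import anyio

    from scalpel import Configuration
    from scalpel.any_io import StaticSpider

    async def _noop_parse(*_) -> None:
        pass

    async def _main() -> None:
        spider = StaticSpider(urls=['http://example.com'], parse=_noop_parse, config=Configuration())
        await spider.run()
        # statistics are available once run() completes
        print(spider.statistics())

    anyio.run(_main)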