Example #1
0
    async def test_should_work_with_http_url(self, page_content, tmp_path,
                                             anyio_backend):
        """The spider crawls a mocked HTTP site and reports correct statistics.

        The site exposes a root page linking to page2/page3 and has no
        robots.txt (404), so three requests are expected in total.
        """
        url = 'http://quotes.com'
        # Mock the site: missing robots.txt, a root page and two linked pages.
        respx.get(url, path='/robots.txt') % 404
        respx.get(url, path='/') % {'html': page_content('page1.html')}
        for page_number in (2, 3):
            respx.get(url, path=f'/page{page_number}.html') % {
                'html': page_content(f'page{page_number}.html')
            }

        backup_path = tmp_path / 'backup.mp'
        spider_config = Configuration(item_processors=[self.processor],
                                      backup_filename=f'{backup_path}',
                                      follow_robots_txt=True)
        spider = StaticSpider(urls=[url],
                              parse=self.parse,
                              config=spider_config)
        await spider.run()

        stats = spider.statistics()
        followed_urls = {f'{url}/page{page_number}.html'
                         for page_number in (2, 3)}
        assert stats.reachable_urls == followed_urls | {url}
        assert stats.followed_urls == followed_urls
        assert stats.request_counter == 3
        assert stats.average_fetch_time > 0
        await self.common_assert(stats, backup_path)
Example #2
0
    async def test_should_work_with_file_url(self, page_1_file_url, tmp_path):
        """The spider crawls local ``file://`` pages and reports correct statistics.

        Starting from page 1, the spider follows links to pages 2 and 3, so
        three requests are expected in total.
        """
        backup_path = tmp_path / 'backup.mp'
        config = Configuration(item_processors=[self.processor],
                               backup_filename=f'{backup_path}')
        static_spider = StaticSpider(urls=[page_1_file_url],
                                     parse=self.parse,
                                     config=config)
        await static_spider.run()
        stats = static_spider.statistics()
        # Expected follow-ups are derived from the start url; the '///' -> '/'
        # replacement presumably matches how the spider normalizes followed
        # file urls — TODO confirm against the spider's url handling.
        followed_urls = {
            page_1_file_url.replace('1', '2').replace('///', '/'),
            page_1_file_url.replace('1', '3').replace('///', '/'),
        }

        assert stats.reachable_urls == {page_1_file_url} | followed_urls
        assert stats.followed_urls == followed_urls
        # actual == expected, consistent with the sibling tests in this file
        # (was Yoda-style `3 == stats.request_counter`).
        assert stats.request_counter == 3
        assert stats.average_fetch_time == static_spider._total_fetch_time / stats.request_counter
        await self.common_assert(stats, backup_path)
Example #3
0
    async def test_should_return_correct_statistics_after_running_spider(
            self, anyio_backend):
        """A single-page crawl yields statistics reflecting exactly one request."""
        start_url = 'http://foo.com'
        # Mock a missing robots.txt and a reachable (empty) root page.
        respx.get(start_url, path='/robots.txt') % 404
        respx.get(start_url, path='/')

        async def do_nothing(*_) -> None:
            pass

        spider = StaticSpider(
            urls=[start_url],
            parse=do_nothing,
            config=Configuration(follow_robots_txt=True))
        await spider.run()
        stats = spider.statistics()

        # Only the start url was fetched; nothing followed, failed or excluded.
        assert stats.reachable_urls == {start_url}
        assert stats.unreachable_urls == set()
        assert stats.followed_urls == set()
        assert stats.robot_excluded_urls == set()
        assert stats.request_counter == 1
        assert stats.total_time > 0
        assert stats.average_fetch_time > 0