Example #1
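Runs a StaticSpider over local file:// pages and checks the statistics it collects. page_1_file_url and tmp_path are pytest fixtures; self.processor, self.parse and self.common_assert are defined on the surrounding test class (see the scaffolding sketch after this example).
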
    def test_should_work_with_file_url(self, page_1_file_url, tmp_path):
        backup_path = tmp_path / 'backup.mp'  # msgpack file where scraped items are backed up
        config = Configuration(item_processors=[self.processor], backup_filename=f'{backup_path}')
        static_spider = StaticSpider(urls=[page_1_file_url], parse=self.parse, config=config)
        static_spider.run()
        stats = static_spider.statistics()
        # page1 links to page2 and page3; derive their URLs from page_1_file_url
        # (the spider normalizes 'file:///' URLs to the 'file:/' form).
        followed_urls = {
            page_1_file_url.replace('1', '2').replace('///', '/'),
            page_1_file_url.replace('1', '3').replace('///', '/'),
        }

        assert stats.reachable_urls == {page_1_file_url} | followed_urls
        assert stats.followed_urls == followed_urls
        assert stats.request_counter == 3
        assert stats.average_fetch_time == static_spider._total_fetch_time / stats.request_counter
        self.common_assert(stats, backup_path)
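
These methods are excerpts from a larger test class, so several names are resolved elsewhere. Below is a minimal sketch of the scaffolding they assume; the import paths, fixture bodies and helper implementations are illustrative assumptions, not code from the project:

    import respx

    from scalpel import Configuration        # assumed import path
    from scalpel.green import StaticSpider   # assumed: the gevent-based backend

    class TestSpiderStatistics:
        @staticmethod
        def processor(item):
            # Hypothetical item processor: pass scraped items through unchanged.
            return item

        @staticmethod
        def parse(spider, response):
            # Hypothetical parse callback: follow every link found on the page.
            for link in response.xpath('//a/@href').getall():
                response.follow(link)

        def common_assert(self, stats, backup_path):
            # Hypothetical shared checks: timings were recorded and the backup file exists.
            assert stats.total_time > 0
            assert backup_path.exists()
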
Example #2
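Exercises robots.txt handling against two respx-mocked hosts: foo.com may be crawled because its robots.txt answers 404, while bar.com answers 401 and must end up in robot_excluded_urls.
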
    @respx.mock  # assumed: respx's HTTP mocking must be active for the routes below
    def test_should_return_correct_statistics_after_running_spider(self):
        url1 = 'http://foo.com'
        url2 = 'http://bar.com'
        # url1 is crawlable: its page answers 200 and its robots.txt is a 404.
        respx.get(url1, path='/')
        respx.get(url1, path='/robots.txt') % 404
        # url2's robots.txt answers 401, so the whole site must be excluded.
        respx.get(f'{url2}/robots.txt') % 401

        config = Configuration(follow_robots_txt=True)
        static_spider = StaticSpider(urls=[url1, url2], parse=lambda x, y: None, config=config)
        static_spider.run()
        stats = static_spider.statistics()

        assert stats.reachable_urls == {url1}
        assert stats.unreachable_urls == set()
        assert stats.followed_urls == set()
        assert stats.robot_excluded_urls == {url2}
        # Only url1's page is counted: robots.txt fetches do not increment request_counter.
        assert stats.request_counter == 1
        assert stats.total_time > 0
        assert stats.average_fetch_time > 0
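
The % shorthand used in these mocks is respx's route modifier: an int sets the mocked status code, and a dict is passed through as httpx.Response keyword arguments. A minimal self-contained illustration, with placeholder URLs:

    import httpx
    import respx

    @respx.mock
    def demo():
        respx.get('http://example.com/')                  # no modifier: an empty 200 response
        respx.get('http://example.com/robots.txt') % 404  # int: mocked status code
        respx.get('http://example.com/page.html') % {'html': '<p>hi</p>'}  # dict: response kwargs

        assert httpx.get('http://example.com/').status_code == 200
        assert httpx.get('http://example.com/robots.txt').status_code == 404
        assert 'hi' in httpx.get('http://example.com/page.html').text

    demo()
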
Example #3
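The HTTP counterpart of Example #1: respx serves page1.html (which links to page2 and page3) and a 404 robots.txt, and the same statistics assertions must hold. page_content is a fixture that loads the content of a sample HTML file by name.
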
    @respx.mock  # assumed: respx's HTTP mocking must be active for the routes below
    def test_should_work_with_http_url(self, page_content, tmp_path):
        url = 'http://quotes.com'
        # A 404 robots.txt means crawling is allowed.
        respx.get(url, path='/robots.txt') % 404
        # The root page serves page1.html, which links to page2 and page3.
        respx.get(url, path='/') % {'html': page_content('page1.html')}
        for i in range(2, 4):
            respx.get(url, path=f'/page{i}.html') % {'html': page_content(f'page{i}.html')}

        backup_path = tmp_path / 'backup.mp'
        config = Configuration(
            item_processors=[self.processor], backup_filename=f'{backup_path}', follow_robots_txt=True
        )
        static_spider = StaticSpider(urls=[url], parse=self.parse, config=config)
        static_spider.run()
        stats = static_spider.statistics()
        followed_urls = {f'{url}/page{i}.html' for i in range(2, 4)}

        assert stats.reachable_urls == {url} | followed_urls
        assert stats.followed_urls == followed_urls
        assert stats.request_counter == 3
        assert stats.average_fetch_time == static_spider._total_fetch_time / stats.request_counter
        self.common_assert(stats, backup_path)
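
Outside the test suite, the same API reads roughly as follows. Treat this as a sketch: the import paths, the target site and the selector details are assumptions based on the library's documented usage, not taken from the tests above.

    from scalpel import Configuration                       # assumed import path
    from scalpel.green import StaticSpider, StaticResponse  # assumed backend module

    def parse(spider: StaticSpider, response: StaticResponse) -> None:
        # Save each quote found on the page, then follow the pagination link.
        for text in response.xpath('//div[@class="quote"]/span[@class="text"]/text()').getall():
            spider.save_item({'quote': text})
        next_link = response.xpath('//li[@class="next"]/a/@href').get()
        if next_link is not None:
            response.follow(next_link)

    config = Configuration(follow_robots_txt=True, backup_filename='quotes.mp')
    spider = StaticSpider(urls=['http://quotes.toscrape.com'], parse=parse, config=config)
    spider.run()
    print(spider.statistics())  # reachable/followed URLs, request counter, timings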