Beispiel #1
0
    async def test_should_return_false_when_requesting_forbidden_url(
            self, httpx_mock, tmp_path, robots_content, url_path):
        """Check that ``can_fetch`` is ``False`` for a URL under ``url_path``.

        The analyzer runs as ``Googlebot`` and the ``/robots.txt`` route is
        mocked with ``robots_content`` as its text.
        NOTE(review): the rules in ``robots_content`` come from a fixture
        that is not visible here -- presumably they disallow ``url_path``.
        """
        robots_analyzer = RobotsAnalyzer(robots_cache=tmp_path,
                                         user_agent='Googlebot')
        # Configure the mocked route before the analyzer is queried.
        httpx_mock.get('/robots.txt') % {'text': robots_content}
        target_url = f'http://example.com/{url_path}/1'

        fetch_allowed = await robots_analyzer.can_fetch(target_url)

        assert fetch_allowed is False
Beispiel #2
0
    async def test_should_call_http_client_aclose_method(self, tmp_path):
        """Check that ``close`` awaits ``aclose`` on the given client once."""
        fake_client = mock.AsyncMock()
        robots_analyzer = RobotsAnalyzer(
            user_agent='Mozilla/5.0',
            http_client=fake_client,
            robots_cache=tmp_path,
        )

        await robots_analyzer.close()

        # The injected client is the only collaborator that must be closed.
        fake_client.aclose.assert_awaited_once()
Beispiel #3
0
    def test_should_correctly_instantiate_class_without_giving_httpx_response(
            self, tmp_path):
        """Check the analyzer's initial state when no HTTP client is passed.

        Asserts that an ``httpx.AsyncClient`` is present and carries the
        analyzer's user agent in its ``User-Agent`` header, and that both
        internal mappings start out empty.
        """
        expected_agent = 'Mozilla/5.0'
        robots_analyzer = RobotsAnalyzer(robots_cache=tmp_path,
                                         user_agent=expected_agent)
        client = robots_analyzer._http_client

        assert robots_analyzer._user_agent == expected_agent
        assert robots_analyzer._robots_cache == tmp_path
        assert isinstance(client, httpx.AsyncClient)
        assert client.headers['User-Agent'] == expected_agent
        assert isinstance(robots_analyzer._robots_parser, RobotFileParser)
        assert_dicts(robots_analyzer._robots_mapping, {})
        assert_dicts(robots_analyzer._delay_mapping, {})
Beispiel #4
0
    async def test_should_call_can_fetch_only_one_time(self, mocker, tmp_path):
        """Check that repeated delay lookups await ``can_fetch`` only once.

        ``can_fetch`` is replaced by an async mock returning ``False``; two
        consecutive ``get_request_delay`` calls for the same URL must both
        yield ``-1`` while the mock is awaited a single time with that URL.
        """
        page_url = 'http://example.com/page/1'
        can_fetch_stub = mocker.patch(
            'scalpel.any_io.robots.RobotsAnalyzer.can_fetch',
            new=mock.AsyncMock(return_value=False))
        robots_analyzer = RobotsAnalyzer(user_agent='Mozilla/5.0',
                                         robots_cache=tmp_path)

        # Ask twice for the same URL; both answers must be -1.
        for _ in range(2):
            assert await robots_analyzer.get_request_delay(page_url, 0) == -1

        can_fetch_stub.assert_awaited_once_with(page_url)
Beispiel #5
0
    def test_should_correctly_instantiate_class_with_httpx_response_passed_as_argument(
            self, tmp_path):
        """Check the analyzer's initial state when an HTTP client is passed.

        Asserts that the client's ``User-Agent`` header is still
        ``python-httpx`` after construction, i.e. it differs from the
        analyzer's own user agent, and that both mappings start out empty.
        """
        client_agent = 'python-httpx'
        analyzer_agent = 'Mozilla/5.0'
        supplied_client = httpx.AsyncClient(
            headers={'User-Agent': client_agent})
        robots_analyzer = RobotsAnalyzer(http_client=supplied_client,
                                         robots_cache=tmp_path,
                                         user_agent=analyzer_agent)
        client = robots_analyzer._http_client

        assert robots_analyzer._user_agent == analyzer_agent
        assert robots_analyzer._robots_cache == tmp_path
        assert isinstance(client, httpx.AsyncClient)
        assert client.headers['User-Agent'] == client_agent
        assert isinstance(robots_analyzer._robots_parser, RobotFileParser)
        assert_dicts(robots_analyzer._robots_mapping, {})
        assert_dicts(robots_analyzer._delay_mapping, {})
Beispiel #6
0
    async def test_should_return_delay_if_it_is_in_internal_delay_mapping(
            self, mocker, tmp_path):
        """Check that a delay already in ``_delay_mapping`` is returned.

        With ``example.com`` pre-seeded in the mapping, ``get_request_delay``
        must return the stored value and call neither ``can_fetch`` nor
        ``RobotFileParser.crawl_delay``.
        """
        can_fetch_mock = mocker.patch(
            'scalpel.any_io.robots.RobotsAnalyzer.can_fetch')
        crawl_delay_mock = mocker.patch(
            'urllib.robotparser.RobotFileParser.crawl_delay')
        stored_delay = 2
        robots_analyzer = RobotsAnalyzer(user_agent='Mozilla/5.0',
                                         robots_cache=tmp_path)
        # Seed the mapping so the lookup has an existing entry for the host.
        robots_analyzer._delay_mapping['example.com'] = stored_delay

        returned_delay = await robots_analyzer.get_request_delay(
            'http://example.com/page/1', 0)

        assert returned_delay == stored_delay
        can_fetch_mock.assert_not_called()
        crawl_delay_mock.assert_not_called()
Beispiel #7
0
def anyio_analyzer(tmp_path):
    """Return a ``RobotsAnalyzer`` using ``tmp_path`` as its robots cache.

    The analyzer is created with the ``Mozilla/5.0`` user agent.
    NOTE(review): presumably registered as a pytest fixture; no decorator
    is visible in this excerpt.
    """
    analyzer = RobotsAnalyzer(robots_cache=tmp_path, user_agent='Mozilla/5.0')
    return analyzer