def test_should_raise_error_if_parse_function_raises_error_and_ignore_errors_is_false(self):
    """An exception raised by the parse callback must propagate when ignore_errors is False."""
    target = 'http://foo.com'
    respx.get(target)

    def failing_parse(*_):
        raise ValueError('just a test')

    spider = StaticSpider(urls=[target], parse=failing_parse, ignore_errors=False)
    with pytest.raises(ValueError) as exc_info:
        spider._handle_url(target)
    assert 'just a test' == str(exc_info.value)
def test_should_not_raise_error_if_parse_function_raises_error_and_ignore_errors_is_true(self):
    """With ignore_errors=True the spider swallows exceptions raised by the parse callback."""
    target = 'http://foo.com'
    respx.get(target)

    def failing_parse(*_):
        raise ValueError('just a test')

    spider = StaticSpider(urls=[target], parse=failing_parse, ignore_errors=True)
    try:
        spider._handle_url(target)
    except ValueError:
        pytest.fail('unexpected ValueError raised when ignore_errors is set to true')
def test_should_not_called_parse_method_if_httpx_response_is_an_error_one(self, mocker, status_code):
    """The parse callback is skipped and an info message logged for HTTP error responses."""
    recorded = []
    target = 'http://foo.com'

    def parse(spider, response):
        recorded.extend([spider, response])

    respx.get(target) % status_code
    info_mock = mocker.patch('logging.Logger.info')
    spider = StaticSpider(urls=[target], parse=parse)
    spider._handle_url(target)

    assert [] == recorded
    info_mock.assert_any_call('fetching url %s returns an error with status code %s', target, status_code)
def test_should_not_called_parse_method_when_file_cannot_be_opened(self, tmp_path, mocker):
    """An unreadable file url is logged, recorded as unreachable and never parsed."""
    exception_mock = mocker.patch('logging.Logger.exception')
    # the file is deliberately never created on disk, so opening it fails
    missing_file = tmp_path / 'hello.txt'
    file_url = missing_file.resolve().as_uri()
    recorded = []

    def parse(spider, response):
        recorded.extend([spider, response])

    spider = StaticSpider(urls=[file_url], parse=parse)
    spider._handle_url(file_url)

    assert [] == recorded
    exception_mock.assert_any_call('unable to open file %s', file_url)
    assert {file_url} == spider.unreachable_urls
    assert set() == spider.reachable_urls
def test_should_not_called_parse_method_if_url_is_forbidden_by_robots_txt(self, mocker):
    """A url excluded by robots.txt is recorded as such and the parse callback never runs."""
    recorded = []
    target = 'http://foo.com'

    def parse(spider, response):
        recorded.extend([spider, response])

    # a 401 on robots.txt makes the url count as excluded/not reachable
    respx.get(f'{target}/robots.txt') % 401
    info_mock = mocker.patch('logging.Logger.info')
    spider = StaticSpider(urls=[target], parse=parse, config=Configuration(follow_robots_txt=True))
    spider._handle_url(target)

    assert [] == recorded
    assert {target} == spider.robots_excluded_urls
    info_mock.assert_any_call(
        'robots.txt rule has forbidden the processing of url %s or the url is not reachable', target
    )
def test_should_read_file_content_when_giving_a_file_url(self, tmp_path):
    """A file:// url is read from disk and handed to parse as a StaticResponse."""
    recorded = []
    text_file = tmp_path / 'hello.txt'
    text_file.write_text('hello world')
    file_url = text_file.resolve().as_uri()

    def parse(spider, response):
        recorded.extend([spider, response])

    spider = StaticSpider(urls=[file_url], parse=parse)
    spider._handle_url(file_url)

    assert recorded[0] is spider
    response = recorded[1]
    assert isinstance(response, StaticResponse)
    assert file_url == response._url
    assert 'hello world' == response._text
    # file urls are read directly, no HTTP round trip is involved
    assert response._httpx_response is None
    assert {file_url} == spider.reachable_urls
def test_should_fetch_content_when_giving_http_url(self):
    """An http url is fetched over the network and wrapped in a StaticResponse before parsing."""
    recorded = []
    target = 'http://foo.com'

    def parse(spider, response):
        recorded.extend([spider, response])

    respx.get(target) % {'status_code': 200, 'text': 'http content'}
    spider = StaticSpider(urls=[target], parse=parse)
    spider._handle_url(target)

    assert recorded[0] is spider
    response = recorded[1]
    assert isinstance(response, StaticResponse)
    # for http urls the response wraps the httpx response; _url and _text stay empty
    assert '' == response._url
    assert '' == response._text
    assert 200 == response._httpx_response.status_code
    assert 'http content' == response._httpx_response.text
    assert 1 == spider.request_counter
    assert spider._total_fetch_time > 0