def test_should_raise_error_when_file_is_not_path_or_string(self, env_file):
    """load_from_dotenv must reject an env_file that is neither Path nor str with a TypeError."""
    with pytest.raises(TypeError) as err:
        Configuration.load_from_dotenv(env_file)
    expected_message = f'env file must be of type Path or str but you provided {type(env_file)}'
    assert str(err.value) == expected_message
def test_should_raise_error_when_file_has_not_correct_type(self, test_file):
    """_check_file must raise a TypeError naming the file kind when given a non Path/str value."""
    with pytest.raises(TypeError) as err:
        Configuration._check_file(test_file, 'txt')
    expected_message = f'txt file must be of type Path or str but you provided {type(test_file)}'
    assert str(err.value) == expected_message
def test_should_raise_error_when_file_is_not_valid_yaml(self, tmp_path):
    """load_from_yaml must raise DecodeError when the file content is not parseable YAML."""
    # TOML-style content written into a .yaml file is not valid YAML.
    bogus_content = """
    [scalpel]
    foo = bar
    """
    yaml_file = tmp_path / 'foo.yaml'
    yaml_file.write_text(bogus_content)
    with pytest.raises(DecodeError):
        Configuration.load_from_yaml(yaml_file)
def test_should_not_raise_error_when_value_is_none(self):
    """None must be an accepted value for selenium_driver_log_file.

    Fix: the failure message previously referred to ``selenium_driver_log_path``,
    which does not match the ``selenium_driver_log_file`` keyword actually under
    test; the message now names the right attribute.
    """
    try:
        Configuration(selenium_driver_log_file=None)
    except ValueError:
        pytest.fail(
            'unexpected error when setting selenium_driver_log_file with None value'
        )
def test_should_not_raise_error_when_giving_correct_config_argument(self, default_spider_arguments):
    """A Spider must accept a valid Configuration instance via the config argument."""
    valid_config = Configuration(fetch_timeout=0)
    try:
        Spider(**default_spider_arguments, config=valid_config)
    except TypeError as e:
        pytest.fail(f'unexpected error when instantiating spider: {e}')
def test_request_property_is_between_min_and_max_delay(self, min_delay, max_delay):
    """request_delay must always fall within [min_request_delay, max_request_delay]."""
    config = Configuration(min_request_delay=min_delay, max_request_delay=max_delay)
    # Single chained comparison covers both bounds at once.
    assert config.min_request_delay <= config.request_delay <= config.max_request_delay
async def test_should_work_with_http_url(self, page_content, tmp_path, anyio_backend):
    """End-to-end crawl of a mocked HTTP site: page1 links to page2/page3,
    and the statistics plus the msgpack backup are checked afterwards."""
    url = 'http://quotes.com'
    # robots.txt answers 404, so with follow_robots_txt=True every URL is allowed.
    respx.get(url, path='/robots.txt') % 404
    respx.get(url, path='/') % {'html': page_content('page1.html')}
    # Mock the two pages that page1 links to.
    for i in range(2, 4):
        respx.get(url, path=f'/page{i}.html') % {
            'html': page_content(f'page{i}.html')
        }
    backup_path = tmp_path / 'backup.mp'
    config = Configuration(item_processors=[self.processor],
                           backup_filename=f'{backup_path}',
                           follow_robots_txt=True)
    static_spider = StaticSpider(urls=[url], parse=self.parse, config=config)
    await static_spider.run()
    stats = static_spider.statistics()
    followed_urls = {f'{url}/page{i}.html' for i in range(2, 4)}
    # The start URL plus both followed pages must all be reachable.
    assert stats.reachable_urls == {url} | followed_urls
    assert stats.followed_urls == followed_urls
    # 3 requests: start page + 2 followed pages (robots.txt is not counted).
    assert stats.request_counter == 3
    assert stats.average_fetch_time > 0
    # Shared assertions on the backup file content live in common_assert.
    await self.common_assert(stats, backup_path)
def tests_should_return_default_backup_filename_when_no_one_is_given(self, mocker):
    """When no backup_filename is supplied, a uuid4-based default name is generated."""
    fixed_uuid = uuid.UUID('84a49591-c522-4a1c-971c-cf0282c6a759')
    mocker.patch('uuid.uuid4', return_value=fixed_uuid)
    config = Configuration()
    assert config.backup_filename == 'backup-84a49591-c522-4a1c-971c-cf0282c6a759.mp'
def test_should_fetch_content_when_giving_http_url(self, mocker):
    """_handle_url on an HTTP URL must invoke parse with the spider and a
    SeleniumResponse, and update the spider's bookkeeping attributes."""
    parse_args = []
    url = 'http://foo.com'

    def parse(sel_spider, response):
        # Capture the arguments so they can be inspected after the call.
        parse_args.extend([sel_spider, response])

    # NOTE(review): other tests in this file stub robots.txt via
    # `respx.get(...) % 404`; the `status_code=` keyword used here belongs to an
    # older respx API — confirm it matches the pinned respx version.
    respx.get(f'{url}/robots.txt', status_code=404)
    # Avoid launching a real browser navigation.
    mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get')
    mocker.patch(
        'selenium.webdriver.remote.webdriver.WebDriver.current_window_handle',
        'handle')
    config = Configuration(follow_robots_txt=True, selenium_driver_log_file=None)
    spider = SeleniumSpider(urls=[url], parse=parse, config=config)
    spider._handle_url(url)
    assert parse_args[0] is spider
    selenium_response = parse_args[1]
    assert isinstance(selenium_response, SeleniumResponse)
    assert selenium_response.driver is spider._driver
    assert 'handle' == selenium_response.handle
    # The URL was fetched exactly once and counted as reachable.
    assert {url} == spider.reachable_urls
    assert set() == spider.unreachable_urls
    assert 1 == spider.request_counter
    assert spider._total_fetch_time > 0
    # cleanup
    spider._cleanup()
def test_should_save_correct_output_when_giving_file_url(self, page_1_file_url, tmp_path, browser):
    """Full SeleniumSpider run over local file:// pages: checks statistics and
    the content of the msgpack backup.

    Fixes: removed a leftover debug ``print(item)`` from the verification loop
    and dropped a duplicated ``assert stats.total_time > 0``.
    """
    backup_path = tmp_path / 'backup.mp'
    config = Configuration(item_processors=[self.processor],
                           backup_filename=f'{backup_path}',
                           selenium_driver_log_file=None,
                           selenium_browser=browser)
    spider = SeleniumSpider(urls=[page_1_file_url], parse=self.parse, config=config)
    spider.run()
    stats = spider.statistics()
    # page1 links to page2 and page3 (same URL with the digit swapped).
    followed_urls = {
        page_1_file_url.replace('1', '2'),
        page_1_file_url.replace('1', '3')
    }
    assert followed_urls == stats.followed_urls
    assert {page_1_file_url} | followed_urls == stats.reachable_urls
    assert 3 == stats.request_counter
    assert stats.total_time > 0
    assert stats.average_fetch_time == spider._total_fetch_time / stats.request_counter
    assert set() == stats.unreachable_urls
    assert set() == stats.robot_excluded_urls
    # Every backed-up item must have a decoded datetime; Einstein appears 3 times.
    albert_count = 0
    for item in read_mp(backup_path, decoder=datetime_decoder):
        assert isinstance(item['date'], datetime)
        if item['author'] == 'Albert Einstein':
            albert_count += 1
    assert albert_count == 3
def test_should_read_file_content_when_giving_a_file_url(self, tmp_path):
    """_handle_url on a file:// URL must expose the file content through the driver."""
    captured = []
    text_file = tmp_path / 'hello.txt'
    text_file.write_text('Hello world!')
    file_url = text_file.resolve().as_uri()

    def parse(sel_spider, response):
        captured.extend([sel_spider, response])

    config = Configuration(selenium_driver_log_file=None)
    spider = SeleniumSpider(urls=[file_url], parse=parse, config=config)
    spider._handle_url(file_url)

    assert captured[0] is spider
    sel_response = captured[1]
    assert isinstance(sel_response, SeleniumResponse)
    # The browser renders a plain text file inside a <pre> element.
    assert '<body><pre>Hello world!</pre></body>' in sel_response.driver.page_source
    assert spider.reachable_urls == {file_url}
    assert spider.unreachable_urls == set()
    assert spider.request_counter == 1
    assert spider._total_fetch_time > 0
    # cleanup
    spider._cleanup()
def test_should_not_raise_error_when_giving_correct_path(self, tmp_path):
    """An existing, writable robots_cache_folder must be accepted without creating files."""
    try:
        Configuration(robots_cache_folder=tmp_path)
    except (FileNotFoundError, PermissionError) as e:
        pytest.fail(f'unexpected error when instantiating Configuration with robots_cache_folder: {e}')
    # The folder itself is used for validation only; nothing is written into it.
    dummy = tmp_path / 'dummy_file'
    assert not dummy.exists()
def test_should_not_raise_error_when_value_is_a_compatible_string(self, str_browser, browser):
    """A browser name given as a string must be converted to the matching enum member."""
    try:
        config = Configuration(selenium_browser=str_browser)
    except ValueError:
        pytest.fail(
            'unexpected error when setting selenium browser attribute')
    else:
        assert config.selenium_browser is browser
def test_should_convert_string_to_callable_list(self, math_module):
    """Dotted-path strings (comma- or colon-separated) must resolve to callable lists."""
    config = Configuration(
        item_processors='custom_math.add, custom_math.minus',
        response_middlewares='custom_math.add:custom_math.minus'
    )
    expected_callables = [math_module.add, math_module.minus]
    assert config.response_middlewares == expected_callables
    assert config.item_processors == expected_callables
def test_should_not_raise_error_when_value_is_a_browser_enum_member(self, value):
    """A browser enum member must be stored unchanged on the configuration."""
    try:
        config = Configuration(selenium_browser=value)
    except ValueError:
        pytest.fail(
            'unexpected error when setting selenium browser attribute')
    else:
        assert config.selenium_browser is value
def test_should_not_raise_error_when_msgpack_encoder_or_decoder_is_a_callable(self, parameter):
    """A callable msgpack encoder/decoder keyword must be accepted as-is."""
    try:
        Configuration(**parameter)
    except Exception as e:
        pytest.fail(
            f'unexpected error when instantiating msgpack encoder or decoder: {e}'
        )
def test_should_return_correct_config_when_given_correct_toml_file(self, tmp_path):
    """load_from_toml must read known scalpel keys (and ignore unknown ones) from Path or str."""
    toml_content = """
    [scalpel]
    foo = "bar"
    user_agent = "Mozilla/5.0"
    fetch_timeout = 4.0
    follow_robots_txt = true
    """
    toml_file = tmp_path / 'settings.toml'
    toml_file.write_text(toml_content)
    expected_config = Configuration(fetch_timeout=4.0, user_agent='Mozilla/5.0',
                                    follow_robots_txt=True)
    # The loader must accept both a str and a Path.
    for candidate in (f'{toml_file}', toml_file):
        config = Configuration.load_from_toml(candidate)
        assert config.fetch_timeout == expected_config.fetch_timeout
        assert config.user_agent == expected_config.user_agent
        assert config.follow_robots_txt == expected_config.follow_robots_txt
def test_should_return_correct_config_when_given_correct_yaml_file(self, tmp_path):
    """load_from_yaml must read known scalpel keys (and ignore unknown ones) from Path or str."""
    yaml_content = """---
scalpel:
  fetch_timeout: 4.0
  user_agent: Mozilla/5.0
  follow_robots_txt: true
  foo: bar
"""
    yaml_file = tmp_path / 'settings.yml'
    yaml_file.write_text(yaml_content)
    expected_config = Configuration(fetch_timeout=4.0, user_agent='Mozilla/5.0',
                                    follow_robots_txt=True)
    # The loader must accept both a str and a Path.
    for candidate in (f'{yaml_file}', yaml_file):
        config = Configuration.load_from_yaml(candidate)
        assert config.fetch_timeout == expected_config.fetch_timeout
        assert config.user_agent == expected_config.user_agent
        assert config.follow_robots_txt == expected_config.follow_robots_txt
def test_default_value_is_a_string_when_fake_user_agent_fails(self, mocker):
    """If the fake-useragent provider raises, a hard-coded Mozilla UA string is used."""
    class BrokenUserAgent:
        # Mimic fake-useragent failing at construction time.
        def __init__(self):
            raise FakeUserAgentError

    mocker.patch('scalpel.core.config.UserAgent', new=BrokenUserAgent)
    config = Configuration()
    assert config.user_agent.startswith('Mozilla/5.0')
async def test_should_return_robots_txt_value_when_follow_robots_txt_is_true(self, robots_content, value):
    """With follow_robots_txt enabled, the delay must come from the robots.txt rules."""
    url = 'http://foo.com'
    robots_txt = f'User-agent:*\n{robots_content}'
    respx.get(f'{url}/robots.txt', content=robots_txt)
    config = Configuration(follow_robots_txt=True)
    static_spider = StaticSpider(urls=[url], parse=lambda x, y: None, config=config)
    delay = await static_spider._get_request_delay(url)
    assert delay == value
async def test_should_return_config_delay_when_follow_robots_txt_is_false(self):
    """With follow_robots_txt disabled, robots.txt is never fetched and the
    configured delay is returned directly."""
    url = 'http://foo.com'
    robots_request = respx.get(f'{url}/robots.txt', content='User-agent:*\nDisallow: ')
    config = Configuration(min_request_delay=3, max_request_delay=3)
    static_spider = StaticSpider(urls=[url], parse=lambda x, y: None, config=config)
    delay = await static_spider._get_request_delay(url)
    # The mocked robots.txt endpoint must not have been hit.
    assert not robots_request.called
    assert delay == 3
def test_specific_static_attributes_are_correctly_instantiated(self):
    """StaticSpider.__init__ must set up its client, analyser, lock, queue and pool."""
    config = Configuration(user_agent='mozilla/5.0')
    spider = StaticSpider(urls=['http://foo.com'], parse=lambda x, y: None, config=config)
    # Check each internal attribute against its expected type.
    assert isinstance(spider._start_time, float)
    assert isinstance(spider._http_client, httpx.Client)
    assert isinstance(spider._robots_analyser, RobotsAnalyzer)
    assert spider._config == config
    assert isinstance(spider._lock, RLock)
    assert isinstance(spider._queue, JoinableQueue)
    # Every start URL is enqueued up-front.
    assert spider._queue.qsize() == len(spider.urls)
    assert isinstance(spider._pool, Pool)
def test_should_instantiate_correctly_driver_attribute(self, browser, name):
    """The spider must expose a WebDriver matching the configured browser."""
    config = Configuration(selenium_browser=browser, selenium_driver_log_file=None)
    spider = self.CustomSpider(urls=['http://foo.com'], parse=lambda x, y: None, config=config)
    assert isinstance(spider.driver, WebDriver)
    assert spider.driver.name == name
    # cleanup
    spider.driver.quit()
async def test_should_exclude_url_when_robots_txt_excludes_it(self):
    """A robots.txt answering 401 must push the URL into robots_excluded_urls."""
    url = 'http://foo.com'
    # 401 on robots.txt means the whole site is off limits.
    respx.get(f'{url}/robots.txt') % 401

    async def parse(*_) -> None:
        pass

    config = Configuration(follow_robots_txt=True)
    static_spider = StaticSpider(urls=[url], parse=parse, config=config)
    await static_spider.run()
    assert static_spider.reachable_urls == set()
    assert static_spider.robots_excluded_urls == {url}
async def test_should_do_nothing_if_url_is_already_present_in_one_url_set(self, mocker, reachable_urls, unreachable_urls, robots_excluded_urls):
    """_handle_url must short-circuit (with a debug log) for already-seen URLs."""
    url = 'http://foo.com'
    debug_mock = mocker.patch('logging.Logger.debug')
    config = Configuration(selenium_driver_log_file=None)
    spider = SeleniumSpider(urls=[url], parse=lambda x, y: None, config=config)
    # Pre-populate the three bookkeeping sets from the parametrized fixtures.
    spider.reachable_urls = reachable_urls
    spider.unreachable_urls = unreachable_urls
    spider.robots_excluded_urls = robots_excluded_urls
    await spider._handle_url(url)
    debug_mock.assert_any_call('url %s has already been processed', url)
async def test_should_return_selenium_response_when_giving_correct_input(self, browser, handle):
    """_get_selenium_response must wire driver, handle and the spider's shared state."""
    config = Configuration(selenium_driver_log_file=None, selenium_browser=browser)
    spider = SeleniumSpider(urls=['http://foo.com'], parse=lambda x, y: None, config=config)
    sel_response = spider._get_selenium_response(handle)
    assert isinstance(sel_response, SeleniumResponse)
    assert sel_response.driver is spider._driver
    assert sel_response.handle == handle
    # The response shares the spider's URL sets and queue, not copies of them.
    assert sel_response._reachable_urls == spider.reachable_urls
    assert sel_response._followed_urls == spider.followed_urls
    assert sel_response._queue == spider._queue
    # cleanup
    await spider._cleanup()
async def test_selenium_attributes_are_correctly_instantiated(self):
    """SeleniumSpider.__init__ must set up driver, async client, analyser, lock and queue."""
    config = Configuration(selenium_driver_log_file=None)
    spider = SeleniumSpider(urls=['http://foo.com'], parse=lambda x, y: None, config=config)
    # Check each internal attribute against its expected type.
    assert isinstance(spider._driver, WebDriver)
    assert isinstance(spider._start_time, float)
    assert isinstance(spider._http_client, httpx.AsyncClient)
    assert isinstance(spider._robots_analyser, RobotsAnalyzer)
    assert spider._config == config
    assert isinstance(spider._lock, trio.Lock)
    assert isinstance(spider._queue, Queue)
    # cleanup
    await spider._cleanup()
def test_should_return_correct_config_given_correct_env_file(self, tmp_path, math_module):
    """load_from_dotenv must pick up SCALPEL_* variables and ignore the rest."""
    env_content = """
    FOO = BAR
    SCALPEL_FOLLOW_ROBOTS_TXT = yes
    SCALPEL_FETCH_TIMEOUT = 2.0
    SCALPEL_RESPONSE_MIDDLEWARES = custom_math.add:custom_math.minus
    """
    env_file = tmp_path / '.env'
    env_file.write_text(env_content)
    config = Configuration.load_from_dotenv(env_file)
    assert config.follow_robots_txt is True
    assert config.fetch_timeout == 2.0
    # Colon-separated dotted paths must resolve to the actual callables.
    assert config.response_middlewares == [math_module.add, math_module.minus]
def test_should_return_non_empty_dict_when_scalpel_attributes_found(self):
    """_scalpel_attributes must keep only known keys (case-insensitively) from the
    'scalpel' section and drop everything else, including underscore-prefixed keys."""
    raw_data = {
        'name': 'paul',
        'scalpel': {
            'min_request_delay': 1,
            'foo': 'bar',
            'USER_AGENT': 'Mozilla/5.0',
            'fruit': 'pineapple',
            '_config': 'foobar'
        }
    }
    expected = {'min_request_delay': 1, 'user_agent': 'Mozilla/5.0'}
    assert_dicts(expected, Configuration._scalpel_attributes(raw_data))
def test_should_not_raise_error_when_msgpack_encoder_or_decoder_is_a_string_representing_a_callable(self, math_module):
    """Dotted-path strings for msgpack encoder/decoder must resolve to the callables.

    The math helpers are not realistic msgpack hooks; they merely prove that
    string-to-callable resolution works.
    """
    config = None
    try:
        config = Configuration(msgpack_encoder='custom_math.add',
                               msgpack_decoder='custom_math.minus')
    except Exception as e:
        pytest.fail(
            f'unexpected error when instantiating msgpack encoder or decoder: {e}'
        )
    assert config.msgpack_encoder is math_module.add
    assert config.msgpack_decoder is math_module.minus