Beispiel #1
0
    def test_should_read_file_content_when_giving_a_file_url(self, tmp_path):
        """Check that a ``file://`` url is loaded and handed to the parse callback.

        A text file is written under ``tmp_path`` and its uri is given to the
        spider. The test then verifies the callback arguments, the driver page
        source and the spider bookkeeping (url sets, request counter, fetch
        time).
        """
        parse_args = []
        hello_file = tmp_path / 'hello.txt'
        hello_file.write_text('Hello world!')
        file_url = hello_file.resolve().as_uri()

        def parse(sel_spider, response):
            # Record what the spider passes to the callback for the assertions below.
            parse_args.extend([sel_spider, response])

        spider = SeleniumSpider(
            urls=[file_url],
            parse=parse,
            config=Configuration(selenium_driver_log_file=None))

        # try/finally: the spider cleanup must run even when an assertion fails.
        try:
            spider._handle_url(file_url)

            assert parse_args[0] is spider
            sel_response = parse_args[1]
            assert isinstance(sel_response, SeleniumResponse)
            assert '<body><pre>Hello world!</pre></body>' in sel_response.driver.page_source
            assert {file_url} == spider.reachable_urls
            assert set() == spider.unreachable_urls
            assert 1 == spider.request_counter
            assert spider._total_fetch_time > 0
        finally:
            # cleanup
            spider._cleanup()
Beispiel #2
0
    def test_should_fetch_content_when_giving_http_url(self, mocker):
        """Check that an http url is fetched and handed to the parse callback.

        ``WebDriver.get`` and ``WebDriver.current_window_handle`` are patched
        and the robots.txt route answers 404. The test expects the url to be
        recorded as reachable and the callback to receive the spider plus a
        ``SeleniumResponse`` carrying the spider's driver and the patched
        window handle.
        """
        parse_args = []
        url = 'http://foo.com'

        def parse(sel_spider, response):
            # Record what the spider passes to the callback for the assertions below.
            parse_args.extend([sel_spider, response])

        # Route form with the ``%`` status shorthand, matching the other
        # robots.txt mocks in this file.
        respx.get(f'{url}/robots.txt') % 404
        mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get')
        mocker.patch(
            'selenium.webdriver.remote.webdriver.WebDriver.current_window_handle',
            'handle')
        config = Configuration(follow_robots_txt=True,
                               selenium_driver_log_file=None)
        spider = SeleniumSpider(urls=[url], parse=parse, config=config)

        # try/finally: the spider cleanup must run even when an assertion fails.
        try:
            spider._handle_url(url)

            assert parse_args[0] is spider
            selenium_response = parse_args[1]
            assert isinstance(selenium_response, SeleniumResponse)
            assert selenium_response.driver is spider._driver
            assert 'handle' == selenium_response.handle
            assert {url} == spider.reachable_urls
            assert set() == spider.unreachable_urls
            assert 1 == spider.request_counter
            assert spider._total_fetch_time > 0
        finally:
            # cleanup
            spider._cleanup()
Beispiel #3
0
    def test_should_do_nothing_if_url_is_already_present_in_one_url_set(
        self, mocker, reachable_urls, unreachable_urls, robots_excluded_urls
    ):
        """Check that an already-known url only produces a debug log entry.

        The three url sets are injected by pytest and assigned onto the spider
        before the url is handled.
        NOTE(review): presumably each parameter set places ``url`` in one of
        the three sets; the parametrization is not visible here -- confirm.
        """
        url = 'http://foo.com'
        logger_mock = mocker.patch('logging.Logger.debug')
        config = Configuration(selenium_driver_log_file=None)
        spider = SeleniumSpider(urls=['http://bar.com'], parse=lambda x, y: None, config=config)

        # try/finally: the spider cleanup must run even when the assertion fails.
        try:
            spider.reachable_urls = reachable_urls
            spider.unreachable_urls = unreachable_urls
            spider.robots_excluded_urls = robots_excluded_urls
            spider._handle_url(url)

            logger_mock.assert_any_call('url %s has already been processed', url)
        finally:
            # cleanup
            spider._cleanup()
Beispiel #4
0
    def test_should_not_raise_error_if_parse_function_raises_error_and_ignore_errors_is_true(self, mocker):
        """Check that a failing parse callback is swallowed when ``ignore_errors=True``.

        ``WebDriver.get`` is patched and the callback always raises
        ``ValueError``; the test fails explicitly if that error escapes
        ``_handle_url``.
        """
        def parse(*_):
            raise ValueError('simple error')

        url = 'http://foo.com'
        mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get')
        config = Configuration(selenium_driver_log_file=None)
        spider = SeleniumSpider(urls=[url], parse=parse, config=config, ignore_errors=True)

        try:
            spider._handle_url(url)
        except ValueError:
            pytest.fail('unexpected ValueError raised when ignore_errors is set to true')
        finally:
            # cleanup -- runs on success, on pytest.fail and on any other error
            spider._cleanup()
Beispiel #5
0
    def test_should_raise_error_if_parse_function_raises_error_and_ignore_errors_is_false(self, mocker):
        """Check that a failing parse callback propagates when ``ignore_errors=False``.

        ``WebDriver.get`` is patched and the callback always raises
        ``ValueError('simple error')``; the test expects that exact error to
        come out of ``_handle_url``.
        """
        def parse(*_):
            raise ValueError('simple error')

        url = 'http://foo.com'
        mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get')
        config = Configuration(selenium_driver_log_file=None)
        spider = SeleniumSpider(urls=[url], parse=parse, config=config, ignore_errors=False)

        # try/finally: the spider cleanup must run even when the expectation fails.
        try:
            with pytest.raises(ValueError) as exc_info:
                spider._handle_url(url)

            assert 'simple error' == str(exc_info.value)
        finally:
            # cleanup
            spider._cleanup()
Beispiel #6
0
    def test_should_not_called_parse_method_if_url_is_not_accessible(self, mocker):
        """Check that the parse callback is skipped when the driver cannot load the url.

        ``WebDriver.get`` is patched to raise ``WebDriverException`` and the
        robots.txt route answers 404. The test expects the url to be recorded
        as unreachable and the request counter and fetch time to stay at 0.
        """
        parse_args = []
        url = 'http://foo.com'

        def parse(sel_spider, response):
            # Would record the callback arguments; expected never to be called.
            parse_args.extend([sel_spider, response])

        respx.get(f'{url}/robots.txt') % 404
        mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get', side_effect=WebDriverException)
        config = Configuration(follow_robots_txt=True, selenium_driver_log_file=None)
        spider = SeleniumSpider(urls=[url], parse=parse, config=config)

        # try/finally: the spider cleanup must run even when an assertion fails.
        try:
            spider._handle_url(url)

            assert [] == parse_args
            assert {url} == spider.unreachable_urls
            assert set() == spider.reachable_urls
            assert 0 == spider.request_counter == spider._total_fetch_time
        finally:
            # cleanup
            spider._cleanup()
Beispiel #7
0
    def test_should_not_call_parse_method_when_file_cannot_be_opened(self, mocker, tmp_path):
        """Check that the parse callback is skipped for a file url that cannot be opened.

        The path under ``tmp_path`` is never written, so the uri points to a
        file that does not exist. The test expects an exception-level log
        message, the url recorded as unreachable, and the request counter and
        fetch time left at 0.
        """
        logger_mock = mocker.patch('logging.Logger.exception')
        hello_file = tmp_path / 'hello.txt'
        file_url = hello_file.resolve().as_uri()
        parse_args = []

        def parse(sel_spider, response):
            # Would record the callback arguments; expected never to be called.
            parse_args.extend([sel_spider, response])

        spider = SeleniumSpider(urls=[file_url], parse=parse, config=Configuration(selenium_driver_log_file=None))

        # try/finally: the spider cleanup must run even when an assertion fails.
        try:
            spider._handle_url(file_url)

            assert [] == parse_args
            logger_mock.assert_any_call(f'unable to open file {file_url}')
            assert {file_url} == spider.unreachable_urls
            assert set() == spider.reachable_urls
            assert 0 == spider.request_counter == spider._total_fetch_time
        finally:
            # cleanup
            spider._cleanup()
Beispiel #8
0
    def test_should_not_called_parse_method_if_url_is_forbidden_by_robots_txt(self, mocker):
        """Check that the parse callback is skipped when robots.txt excludes the url.

        The robots.txt route answers 401 and ``follow_robots_txt`` is enabled.
        The test expects the url to be stored in ``robots_excluded_urls`` and
        an info-level log message to be emitted.
        """
        parse_args = []
        url = 'http://foo.com'

        def parse(sel_spider, response):
            # Would record the callback arguments; expected never to be called.
            parse_args.extend([sel_spider, response])

        respx.get(f'{url}/robots.txt') % 401
        logger_mock = mocker.patch('logging.Logger.info')
        config = Configuration(follow_robots_txt=True, selenium_driver_log_file=None)
        spider = SeleniumSpider(urls=[url], parse=parse, config=config)

        # try/finally: the spider cleanup must run even when an assertion fails.
        try:
            spider._handle_url(url)

            assert [] == parse_args
            assert {url} == spider.robots_excluded_urls
            logger_mock.assert_any_call(
                'robots.txt rule has forbidden the processing of url %s or the url is not reachable', url
            )
        finally:
            # cleanup
            spider._cleanup()