Example 1
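Runs the spider against a local file:// page and checks the collected statistics: the start page plus the two followed pages are reachable, three requests were made, and the average fetch time matches the recorded total.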
    async def test_should_work_with_file_url(self, page_1_file_url, tmp_path):
        backup_path = tmp_path / 'backup.mp'
        config = Configuration(item_processors=[self.processor],
                               backup_filename=f'{backup_path}')
        static_spider = StaticSpider(urls=[page_1_file_url],
                                     parse=self.parse,
                                     config=config)
        await static_spider.run()
        stats = static_spider.statistics()
        followed_urls = {
            page_1_file_url.replace('1', '2').replace('///', '/'),
            page_1_file_url.replace('1', '3').replace('///', '/')
        }

        assert stats.reachable_urls == {page_1_file_url} | followed_urls
        assert stats.followed_urls == followed_urls
        assert 3 == stats.request_counter
        assert stats.average_fetch_time == static_spider._total_fetch_time / stats.request_counter
        await self.common_assert(stats, backup_path)
Example 2
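Checks that _get_request_delay returns the delay advertised by robots.txt when follow_robots_txt is enabled. The robots_content and value arguments are presumably supplied by a pytest.mark.parametrize decorator not shown in this excerpt.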
    async def test_should_return_robots_txt_value_when_follow_robots_txt_is_true(
            self, robots_content, value):
        url = 'http://foo.com'
        respx.get(f'{url}/robots.txt',
                  content=f'User-agent:*\n{robots_content}')
        static_spider = StaticSpider(
            urls=[url],
            parse=lambda x, y: None,
            config=Configuration(follow_robots_txt=True))

        assert value == await static_spider._get_request_delay(url)
Example 3
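The counterpart: with follow_robots_txt left disabled, robots.txt is never requested and the delay comes straight from the configuration (min and max request delay both set to 3).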
    async def test_should_return_config_delay_when_follow_robots_txt_is_false(
            self):
        url = 'http://foo.com'
        request = respx.get(f'{url}/robots.txt',
                            content='User-agent:*\nDisallow: ')
        config = Configuration(min_request_delay=3, max_request_delay=3)
        static_spider = StaticSpider(urls=[url],
                                     parse=lambda x, y: None,
                                     config=config)

        assert not request.called
        assert 3 == await static_spider._get_request_delay(url)
Example 4
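Verifies that a freshly built StaticSpider wires up its internals: the configuration, a trio lock, the URL queue, the start time, an httpx async client and a RobotsAnalyzer.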
    async def test_specific_static_attributes_are_correctly_instantiated(self):
        config = Configuration(user_agent='mozilla/5.0')
        spider = StaticSpider(urls=['http://foo.com'],
                              parse=lambda x, y: None,
                              config=config)

        assert config == spider._config
        assert isinstance(spider._lock, trio.Lock)
        assert isinstance(spider._queue, Queue)
        assert isinstance(spider._start_time, float)
        assert isinstance(spider._http_client, httpx.AsyncClient)
        assert isinstance(spider._robots_analyser, RobotsAnalyzer)
Example 5
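Runs the spider on a single mocked URL whose robots.txt answers 404 and checks every field of the resulting statistics.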
    async def test_should_return_correct_statistics_after_running_spider(self):
        url1 = 'http://foo.com'
        respx.get(url1)
        respx.get(f'{url1}/robots.txt', status_code=404)

        async def parse(*_) -> None:
            pass

        static_spider = StaticSpider(
            urls=[url1],
            parse=parse,
            config=Configuration(follow_robots_txt=True))
        await static_spider.run()
        stats = static_spider.statistics()

        assert stats.reachable_urls == {url1}
        assert stats.unreachable_urls == set()
        assert stats.followed_urls == set()
        assert stats.robot_excluded_urls == set()
        assert stats.request_counter == 1
        assert stats.total_time > 0
        assert stats.average_fetch_time > 0
Example 6
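A robots.txt answering 401 is treated as a blanket exclusion: the URL lands in robots_excluded_urls instead of reachable_urls.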
    async def test_should_exclude_url_when_robots_txt_excludes_it(self):
        url = 'http://foo.com'
        respx.get(f'{url}/robots.txt', status_code=401)

        async def parse(*_) -> None:
            pass

        static_spider = StaticSpider(
            urls=[url],
            parse=parse,
            config=Configuration(follow_robots_txt=True))
        await static_spider.run()

        assert static_spider.reachable_urls == set()
        assert static_spider.robots_excluded_urls == {url}
Example 7
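With ignore_errors=True, an exception raised inside the parse callback must be swallowed by _handle_url.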
    async def test_should_not_raise_error_if_parse_function_raises_error_and_ignore_errors_is_true(
            self):
        async def parse(*_):
            raise ValueError('just a test')

        url = 'http://foo.com'
        respx.get(url)
        static_spider = StaticSpider(urls=[url],
                                     parse=parse,
                                     ignore_errors=True)

        try:
            await static_spider._handle_url(url)
        except ValueError:
            pytest.fail('ValueError was raised and it should not happen')
Example 8
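The mirror case: with ignore_errors=False, the ValueError raised by parse propagates to the caller unchanged.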
    async def test_should_raise_errors_if_parse_function_raises_error_and_ignore_errors_is_false(
            self):
        async def parse(*_):
            raise ValueError('just a test')

        url = 'http://foo.com'
        respx.get(url)
        static_spider = StaticSpider(urls=[url],
                                     parse=parse,
                                     ignore_errors=False)

        with pytest.raises(ValueError) as exc_info:
            await static_spider._handle_url(url)

        assert 'just a test' == str(exc_info.value)
Example 9
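End-to-end run over mocked HTTP pages: the spider follows the links from page1.html to pages 2 and 3 and the statistics account for all three requests.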
    async def test_should_work_with_http_url(self, page_content, tmp_path):
        url = 'http://quotes.com'
        respx.get(f'{url}/robots.txt', status_code=404)
        respx.get(url, content=page_content('page1.html'))
        for i in range(2, 4):
            respx.get(f'{url}/page{i}.html',
                      content=page_content(f'page{i}.html'))

        backup_path = tmp_path / 'backup.mp'
        config = Configuration(item_processors=[self.processor],
                               backup_filename=f'{backup_path}',
                               follow_robots_txt=True)
        static_spider = StaticSpider(urls=[url],
                                     parse=self.parse,
                                     config=config)
        await static_spider.run()
        stats = static_spider.statistics()
        followed_urls = {f'{url}/page{i}.html' for i in range(2, 4)}

        assert stats.reachable_urls == {url} | followed_urls
        assert stats.followed_urls == followed_urls
        assert stats.request_counter == 3
        assert stats.average_fetch_time > 0
        await self.common_assert(stats, backup_path)
Example 10
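If the target file cannot be opened (hello.txt is never created here), parse must not run, the failure is logged and the URL is recorded as unreachable.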
    async def test_should_not_call_parse_method_when_file_cannot_be_opened(
            self, tmp_path, mocker):
        logger_mock = mocker.patch('logging.Logger.exception')
        hello_file = tmp_path / 'hello.txt'
        file_url = hello_file.resolve().as_uri()
        parse_args = []

        async def parse(spider, response):
            parse_args.extend([spider, response])

        static_spider = StaticSpider(urls=[file_url], parse=parse)
        await static_spider._handle_url(file_url)

        assert [] == parse_args
        logger_mock.assert_any_call('unable to open file %s', file_url)
        assert {file_url} == static_spider.unreachable_urls
Example 11
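For HTTP error responses (status_code is presumably parametrized in the full suite), parse must not run and an info-level message is logged instead.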
    async def test_should_not_call_parse_method_if_httpx_response_is_an_error_one(
            self, mocker, status_code):
        parse_args = []
        url = 'http://foo.com'

        def parse(spider, response):
            parse_args.extend([spider, response])

        respx.get(url, status_code=status_code)
        logger_mock = mocker.patch('logging.Logger.info')
        static_spider = StaticSpider(urls=[url], parse=parse)
        await static_spider._handle_url(url)

        assert [] == parse_args
        logger_mock.assert_any_call(
            'fetching url %s returns an error with status code %s', url,
            status_code)
Example 12
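Fetching an HTTP URL hands parse a StaticResponse whose _url and _text stay empty: the payload lives in the wrapped httpx response.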
    async def test_should_fetch_content_when_giving_http_url(self):
        parse_args = []
        url = 'http://foo.com'

        async def parse(spider, response):
            parse_args.extend([spider, response])

        respx.get(url, status_code=200, content='http content')
        static_spider = StaticSpider(urls=[url], parse=parse)
        await static_spider._handle_url(url)

        assert parse_args[0] is static_spider
        static_response = parse_args[1]
        assert isinstance(static_response, StaticResponse)
        assert '' == static_response._url
        assert '' == static_response._text
        assert 200 == static_response._httpx_response.status_code
        assert 'http content' == static_response._httpx_response.text
        assert 1 == static_spider.request_counter
        assert static_spider._total_fetch_time > 0
Example 13
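save_item pushes each item through the configured processors and appends it to the message-pack backup file, which is then read back with the suite's read_mp helper.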
    async def test_should_save_content_to_backup_file(self, tmp_path, capsys):
        def processor(item):
            print("I'm a processor")
            return item

        backup = tmp_path / 'backup.mp'
        fruit_1 = {'fruit': 'pineapple'}
        fruit_2 = {'fruit': 'orange'}
        config = Configuration(backup_filename=f'{backup.resolve()}',
                               item_processors=[processor])
        static_spider = StaticSpider(urls=['https://foo.com'],
                                     parse=lambda x, y: None,
                                     config=config)
        await static_spider.save_item(fruit_1)
        await static_spider.save_item(fruit_2)
        out, _ = capsys.readouterr()

        items = [item async for item in read_mp(f'{backup.resolve()}')]
        assert [fruit_1, fruit_2] == items
        assert "I'm a processor" in out
Example 14
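For file:// URLs there is no httpx response at all: the StaticResponse carries the file's URL and text directly.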
    async def test_should_read_file_content_when_giving_a_file_url(
            self, tmp_path):
        parse_args = []
        hello_file = tmp_path / 'hello.txt'
        hello_file.write_text('hello world')
        file_url = hello_file.resolve().as_uri()

        async def parse(spider, response):
            parse_args.extend([spider, response])

        static_spider = StaticSpider(urls=[file_url], parse=parse)
        await static_spider._handle_url(file_url)

        assert parse_args[0] is static_spider
        static_response = parse_args[1]
        assert isinstance(static_response, StaticResponse)
        assert file_url == static_response._url
        assert 'hello world' == static_response._text
        assert static_response._httpx_response is None
        assert {file_url} == static_spider.reachable_urls
Example 15
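Response middlewares wrap the spider's _fetch coroutine, so the logging middleware below prints before and after the actual request.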
    async def test_middlewares_are_applied_when_fetching_resources(
            self, capsys):
        def log_middleware(fetch):
            async def wrapper(*args, **kwargs):
                print('before fetching')
                response = await fetch(*args, **kwargs)
                print('after fetching')
                return response

            return wrapper

        url = 'http://foo.com'
        respx.get(url)
        config = Configuration(response_middlewares=[log_middleware])
        spider = StaticSpider(urls=[url],
                              parse=lambda x, y: None,
                              config=config)
        response = await spider._fetch(url)

        assert 200 == response.status_code
        out, _ = capsys.readouterr()
        assert 'before fetching' in out
        assert 'after fetching' in out
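
For context, a middleware like this presumably composes with _fetch roughly as follows (a minimal sketch; the function name and wiring are assumptions, not necessarily the library's actual code):

    def apply_middlewares(fetch, middlewares):
        # the last middleware in the list ends up wrapped closest to the
        # real fetch, so the first middleware in the list runs outermost
        for middleware in reversed(middlewares):
            fetch = middleware(fetch)
        return fetch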
Example 16
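Item processors run in order and may be sync or async; any processor returning None rejects the item, which is logged at debug level.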
    async def test_should_call_item_processors_and_reject_item_if_one_processor_returns_none(
            self, capsys, mocker):
        logger_mock = mocker.patch('logging.Logger.debug')
        data = {'banana': True}

        def processor_1(item):
            print("I'm a processor")
            return item

        async def processor_2(item):
            await trio.sleep(0)
            if 'banana' in item:
                return
            return item

        config = Configuration(item_processors=[processor_1, processor_2])
        static_spider = StaticSpider(urls=['http://foo.com'],
                                     parse=lambda x, y: None,
                                     config=config)
        await static_spider.save_item(data)

        logger_mock.assert_any_call('item %s was rejected', data)
        out, _ = capsys.readouterr()
        assert "I'm a processor" in out
Example 17
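A helper that builds a spider over a mix of HTTP and file:// URLs; in the full suite this is presumably registered as a pytest fixture (the decorator is not shown in this excerpt).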
async def trio_spider():
    return StaticSpider(
        urls=['http://foo.com', 'http://bar.com', 'file:///home/kevin/page.html'],
        parse=lambda x, y: None
    )