import pytest

# The module paths below are assumptions; adjust them to the package layout
# of the crawler library under test.
from crawler import CrawlFrontier, CrawlRequest, CrawlerConfiguration

# Shared request used throughout these tests (definition assumed from usage:
# a plain on-site request with default priority).
request = CrawlRequest('https://example.com')


def test_get_next_request_should_return_next_request_with_higher_priority(
) -> None:
    high_priority_request = CrawlRequest('http://test.com', priority=1)
    crawler_configuration = CrawlerConfiguration(
        [request, high_priority_request])
    crawl_frontier = CrawlFrontier(crawler_configuration)

    assert crawl_frontier.get_next_request() is high_priority_request


def test_str_should_return_string_representation() -> None:
    crawler_configuration = CrawlerConfiguration([CrawlRequest('https://example.com')],
                                                 filter_offsite_requests=True,
                                                 allowed_domains=['example.com'])

    assert str(crawler_configuration) == 'CrawlerConfiguration(seed_requests=1 requests, ' \
                                         'filter_duplicate_requests=True, ' \
                                         'filter_offsite_requests=True, ' \
                                         'allowed_domains=1 domains)'


def test_add_request_should_add_allowed_request_to_queue_when_offsite_request_filter_is_enabled(
) -> None:
    crawler_configuration = CrawlerConfiguration(
        [], filter_offsite_requests=True, allowed_domains=['example.com'])
    crawl_frontier = CrawlFrontier(crawler_configuration)

    result = crawl_frontier.add_request(request)

    assert result is True
    assert crawl_frontier.get_next_request() is request


def test_add_request_should_add_duplicate_request_to_queue_when_duplicate_request_filter_is_disabled(
) -> None:
    crawler_configuration = CrawlerConfiguration(
        [request], filter_duplicate_requests=False)
    crawl_frontier = CrawlFrontier(crawler_configuration)
    crawl_frontier.get_next_request()

    result = crawl_frontier.add_request(request)

    assert result is True
    assert crawl_frontier.get_next_request() is request


def test_add_request_should_not_add_duplicate_request_to_queue_when_duplicate_request_filter_is_enabled(
) -> None:
    crawler_configuration = CrawlerConfiguration(
        [CrawlRequest(url='http://example.com/test?abc=def&ghi=jkl#fragment')])
    crawl_frontier = CrawlFrontier(crawler_configuration)
    crawl_frontier.get_next_request()

    result = crawl_frontier.add_request(
        CrawlRequest(url='http://example.com/test?ghi=jkl&abc=def'))

    assert result is False
    assert crawl_frontier.get_next_request() is None
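
The two URLs above differ only in query-parameter order (plus a fragment), yet the second is filtered as a duplicate, so the filter evidently canonicalizes URLs before comparing them. A minimal sketch of such a canonicalization using only the standard library (an illustration, not the library's actual implementation):

from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

def canonicalize(url: str) -> str:
    # Sort the query parameters and drop the fragment so that URLs differing
    # only in parameter order or fragment compare equal.
    scheme, netloc, path, query, _fragment = urlsplit(url)
    sorted_query = urlencode(sorted(parse_qsl(query)))
    return urlunsplit((scheme, netloc, path, sorted_query, ''))

With this, both 'http://example.com/test?abc=def&ghi=jkl#fragment' and 'http://example.com/test?ghi=jkl&abc=def' canonicalize to 'http://example.com/test?abc=def&ghi=jkl', matching the behaviour the test asserts.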


def test_allowed_domains_should_return_empty_list_when_no_allowed_domains_specified() -> None:
    crawler_configuration = CrawlerConfiguration([])

    assert crawler_configuration.allowed_domains == []


def test_filter_offsite_requests_should_return_specified_value_when_specified() -> None:
    crawler_configuration = CrawlerConfiguration([], filter_offsite_requests=True)

    assert crawler_configuration.filter_offsite_requests is True


def test_filter_offsite_requests_should_return_default_value_when_not_specified() -> None:
    crawler_configuration = CrawlerConfiguration([])

    assert crawler_configuration.filter_offsite_requests is False


def test_filter_duplicate_requests_should_return_specified_value_when_specified() -> None:
    crawler_configuration = CrawlerConfiguration([], filter_duplicate_requests=False)

    assert crawler_configuration.filter_duplicate_requests is False


def test_seed_requests_should_return_seed_requests() -> None:
    seed_requests = [CrawlRequest('https://example.com')]
    crawler_configuration = CrawlerConfiguration(seed_requests)

    assert crawler_configuration.seed_requests is seed_requests


def test_constructor_should_raise_value_error_when_invalid_domain_in_allowed_domains() -> None:
    with pytest.raises(ValueError) as exc_info:
        CrawlerConfiguration([], allowed_domains=['example.invalid'])

    assert str(exc_info.value) == 'Could not extract a valid domain from example.invalid'

Example #12
def configure(self) -> CrawlerConfiguration:
    # Seed the crawl with two pages; the first registers a success callback.
    return CrawlerConfiguration([
        CrawlRequest(first_page_url, success_func=self.on_first_page_response),
        CrawlRequest(second_page_url)
    ])
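
The configure snippets on this page are methods of a crawler subclass. A minimal sketch of the surrounding class, assuming a Crawler base class that calls configure() on startup and passes fetched responses to the registered callback (the base-class name and callback signature are assumptions, not shown in the snippets):

class FirstPageCrawler(Crawler):  # 'Crawler' base class assumed
    def configure(self) -> CrawlerConfiguration:
        return CrawlerConfiguration([
            CrawlRequest(first_page_url, success_func=self.on_first_page_response),
            CrawlRequest(second_page_url)
        ])

    def on_first_page_response(self, response) -> None:
        # Invoked on a successful fetch of first_page_url (signature assumed).
        print(response)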

Example #13
def configure(self) -> CrawlerConfiguration:
    return CrawlerConfiguration([CrawlRequest(first_page_url), CrawlRequest(second_page_url)])

Example #14
def configure(self) -> CrawlerConfiguration:
    # An empty configuration: no seed requests.
    return CrawlerConfiguration([])

Example #15
def configure(self) -> CrawlerConfiguration:
    # Attach custom HTTP headers to the seed request.
    return CrawlerConfiguration([CrawlRequest(request_url, headers={'foo': 'bar'})])


def test_get_next_request_should_return_none_when_queue_is_empty() -> None:
    crawler_configuration = CrawlerConfiguration([])
    crawl_frontier = CrawlFrontier(crawler_configuration)

    assert crawl_frontier.get_next_request() is None


def test_has_next_request_should_return_true_when_queue_is_not_empty() -> None:
    crawler_configuration = CrawlerConfiguration([request])
    crawl_frontier = CrawlFrontier(crawler_configuration)

    assert crawl_frontier.has_next_request() is True


def test_allowed_domains_should_return_domains_only() -> None:
    crawler_configuration = CrawlerConfiguration([], allowed_domains=['https://www.example.com:80/'])

    assert crawler_configuration.allowed_domains == ['www.example.com']
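
Together with the ValueError test earlier, this shows that allowed_domains accepts full URLs and reduces them to a host, while rejecting strings without a recognized public suffix. A hedged sketch of that behaviour using the tldextract package (an assumption; the library's actual mechanism is not shown here):

import tldextract

def extract_domain(value: str) -> str:
    # Reduce a URL or host string to its host; reject values whose suffix is
    # not a known public TLD (e.g. 'example.invalid').
    result = tldextract.extract(value)
    if not result.domain or not result.suffix:
        raise ValueError(f'Could not extract a valid domain from {value}')
    return '.'.join(part for part in (result.subdomain, result.domain, result.suffix) if part)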

Example #19
def configure(self) -> CrawlerConfiguration:
    return CrawlerConfiguration([CrawlRequest(redirect_origin_url)])

Example #20
def configure(self) -> CrawlerConfiguration:
    return CrawlerConfiguration([CrawlRequest(request_url)])
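
Putting the pieces together, a short usage sketch driving the frontier directly, using only the calls exercised by the tests above:

configuration = CrawlerConfiguration(
    [CrawlRequest('https://example.com', priority=1)],
    filter_offsite_requests=True,
    allowed_domains=['example.com'])
frontier = CrawlFrontier(configuration)

while frontier.has_next_request():
    current = frontier.get_next_request()
    # Fetch the current request's URL here, then feed discovered links back
    # in; the duplicate filter (enabled by default) keeps the loop finite.
    frontier.add_request(CrawlRequest('https://example.com/about'))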