Esempio n. 1
0
def test_get_next_request_should_return_next_request_with_higher_priority(
) -> None:
    """
    Checks that the request created with ``priority=1`` is returned first.

    The frontier is seeded with the module-level ``request`` followed by the
    prioritized request; the first call to ``get_next_request`` must return
    the prioritized one.
    """
    prioritized = CrawlRequest('http://test.com', priority=1)
    initial_requests = [request, prioritized]
    frontier = CrawlFrontier(CrawlerConfiguration(initial_requests))

    next_request = frontier.get_next_request()

    assert next_request is prioritized
Esempio n. 2
0
def test_add_request_should_add_allowed_request_to_queue_when_offsite_request_filter_is_enabled(
) -> None:
    """
    Checks that an allowed request is queued while offsite filtering is on.

    The frontier starts with no initial requests, offsite filtering enabled
    and ``example.com`` as the only allowed domain. Adding the module-level
    ``request`` must report success and make it the next request.
    """
    frontier = CrawlFrontier(
        CrawlerConfiguration(
            [], filter_offsite_requests=True, allowed_domains=['example.com']))

    was_added = frontier.add_request(request)

    assert was_added is True
    assert frontier.get_next_request() is request
Esempio n. 3
0
def test_add_request_should_add_duplicate_request_to_queue_when_duplicate_request_filter_is_disabled(
) -> None:
    """
    Checks that a duplicate request is accepted when duplicate filtering is off.

    The frontier is seeded with the module-level ``request`` and duplicate
    filtering disabled. After the seeded request has been fetched once, adding
    the same request again must report success and return it as the next
    request.
    """
    frontier = CrawlFrontier(
        CrawlerConfiguration([request], filter_duplicate_requests=False))
    # Fetch the seeded request first so the add below is a re-submission.
    frontier.get_next_request()

    was_added = frontier.add_request(request)

    assert was_added is True
    assert frontier.get_next_request() is request
Esempio n. 4
0
def test_add_request_should_not_add_duplicate_request_to_queue_when_duplicate_request_filter_is_enabled(
) -> None:
    """
    Checks that a duplicate request is rejected with the default configuration.

    The second URL differs from the seeded one only in the order of its query
    parameters and in lacking the fragment; the test expects it to be treated
    as a duplicate, so ``add_request`` must return ``False`` and nothing may
    be left to fetch.
    """
    seeded = CrawlRequest(
        url='http://example.com/test?abc=def&ghi=jkl#fragment')
    frontier = CrawlFrontier(CrawlerConfiguration([seeded]))
    # Fetch the seeded request first so the add below is a re-submission.
    frontier.get_next_request()

    reordered = CrawlRequest(url='http://example.com/test?ghi=jkl&abc=def')
    was_added = frontier.add_request(reordered)

    assert was_added is False
    assert frontier.get_next_request() is None
Esempio n. 5
0
    def __init__(self, crawl_frontier: Optional[CrawlFrontier] = None) -> None:
        """
        Creates a new crawler instance.

        The configuration is obtained from ``self.configure()`` before anything
        else, because the default crawl frontier is constructed from it.

        :param crawl_frontier: a crawl frontier instance (optional); when
            ``None``, a new ``CrawlFrontier`` is created from the configuration
        """

        self._configuration: CrawlerConfiguration = self.configure()

        # Compare against None explicitly rather than relying on truthiness,
        # so a caller-supplied frontier is never discarded merely because it
        # happens to evaluate as falsy.
        if crawl_frontier is None:
            crawl_frontier = CrawlFrontier(self._configuration)
        self._crawl_frontier: CrawlFrontier = crawl_frontier

        # Initial state: not running, no stop requested, no browser or page,
        # and no pending or previously recorded request/response.
        self._running: bool = False
        self._stop_initiated: bool = False
        self._browser: Optional[Browser] = None
        self._page: Optional[Page] = None
        self._page_index: Optional[int] = None
        self._next_request: Optional[CrawlRequest] = None
        self._send_head_request: bool = False
        self._aborted_request: bool = False
        self._last_request: Optional[Request] = None
        self._last_response: Optional[Response] = None
Esempio n. 6
0
def test_get_next_request_should_return_none_when_queue_is_empty() -> None:
    """
    Checks that a frontier built without initial requests yields ``None``.
    """
    frontier = CrawlFrontier(CrawlerConfiguration([]))

    next_request = frontier.get_next_request()

    assert next_request is None
Esempio n. 7
0
def test_has_next_request_should_return_true_when_queue_is_not_empty() -> None:
    """
    Checks that ``has_next_request`` is ``True`` for a frontier seeded with the
    module-level ``request``.
    """
    frontier = CrawlFrontier(CrawlerConfiguration([request]))

    has_pending = frontier.has_next_request()

    assert has_pending is True