def test_get_next_request_should_return_next_request_with_higher_priority() -> None:
    """
    The request created with ``priority=1`` must be returned first.

    The frontier is seeded with ``request`` followed by the ``priority=1``
    request; the first call to ``get_next_request`` is expected to return the
    latter even though it was listed second.

    NOTE(review): ``request`` is not defined in this function - presumably a
    module-level request without an explicit priority; confirm.
    """
    urgent = CrawlRequest('http://test.com', priority=1)
    frontier = CrawlFrontier(CrawlerConfiguration([request, urgent]))

    assert frontier.get_next_request() is urgent
def test_add_request_should_add_allowed_request_to_queue_when_offsite_request_filter_is_enabled() -> None:
    """
    With offsite filtering on, adding ``request`` must succeed.

    The frontier starts with no requests, has ``filter_offsite_requests``
    enabled and allows only ``example.com``. ``add_request`` is expected to
    return ``True`` and the same object must then be the next request.

    NOTE(review): ``request`` is not defined in this function - presumably it
    targets ``example.com``; confirm.
    """
    frontier = CrawlFrontier(
        CrawlerConfiguration(
            [],
            filter_offsite_requests=True,
            allowed_domains=['example.com'],
        )
    )

    was_added = frontier.add_request(request)

    assert was_added is True
    assert frontier.get_next_request() is request
def test_add_request_should_add_duplicate_request_to_queue_when_duplicate_request_filter_is_disabled() -> None:
    """
    With duplicate filtering off, re-adding a known request must succeed.

    The frontier is seeded with ``request`` and ``filter_duplicate_requests``
    is set to ``False``. After one ``get_next_request`` call (result ignored),
    adding the very same ``request`` again is expected to return ``True`` and
    the request must be handed out once more.
    """
    frontier = CrawlFrontier(
        CrawlerConfiguration([request], filter_duplicate_requests=False)
    )

    # Take the seeded request first; its return value is irrelevant here.
    frontier.get_next_request()

    was_added = frontier.add_request(request)

    assert was_added is True
    assert frontier.get_next_request() is request
def test_add_request_should_not_add_duplicate_request_to_queue_when_duplicate_request_filter_is_enabled() -> None:
    """
    An equivalent URL must be rejected as a duplicate.

    The seeded URL carries the query parameters ``abc=def&ghi=jkl`` plus a
    fragment; the URL added afterwards has the same parameters in the opposite
    order and no fragment. ``add_request`` is expected to return ``False`` and
    nothing must be left to hand out.

    NOTE(review): the duplicate filter is not configured explicitly here, so
    the test relies on the configuration's default - presumably enabled.
    """
    seeded = CrawlRequest(url='http://example.com/test?abc=def&ghi=jkl#fragment')
    frontier = CrawlFrontier(CrawlerConfiguration([seeded]))

    # Take the seeded request first; its return value is irrelevant here.
    frontier.get_next_request()

    reordered = CrawlRequest(url='http://example.com/test?ghi=jkl&abc=def')
    was_added = frontier.add_request(reordered)

    assert was_added is False
    assert frontier.get_next_request() is None
def __init__(self, crawl_frontier: Optional[CrawlFrontier] = None) -> None:
    """
    Creates a new crawler instance.

    The configuration is obtained by calling ``self.configure()``. If no crawl
    frontier is supplied, a new one is created from that configuration.

    :param crawl_frontier: a crawl frontier instance (optional); when ``None``,
        a ``CrawlFrontier`` is built from the crawler's configuration
    """
    self._configuration: CrawlerConfiguration = self.configure()

    # Compare against None explicitly: a caller-supplied frontier must never
    # be discarded merely because it happens to evaluate as falsy.
    if crawl_frontier is None:
        crawl_frontier = CrawlFrontier(self._configuration)
    self._crawl_frontier: CrawlFrontier = crawl_frontier

    # Every remaining piece of state starts out as False or None.
    self._running: bool = False
    self._stop_initiated: bool = False

    self._browser: Optional[Browser] = None
    self._page: Optional[Page] = None
    self._page_index: Optional[int] = None

    self._next_request: Optional[CrawlRequest] = None
    self._send_head_request: bool = False
    self._aborted_request: bool = False
    self._last_request: Optional[Request] = None
    self._last_response: Optional[Response] = None
def test_get_next_request_should_return_none_when_queue_is_empty() -> None:
    """
    A frontier configured with no requests must return ``None`` when asked for
    the next request.
    """
    frontier = CrawlFrontier(CrawlerConfiguration([]))

    assert frontier.get_next_request() is None
def test_has_next_request_should_return_true_when_queue_is_not_empty() -> None:
    """
    A frontier seeded with a single request must report ``True`` from
    ``has_next_request``.

    NOTE(review): ``request`` is not defined in this function - presumably a
    module-level request object; confirm.
    """
    frontier = CrawlFrontier(CrawlerConfiguration([request]))

    assert frontier.has_next_request() is True