def _handle_redirect(self, origin_crawl_request: CrawlRequest, origin_request: Request, response: Response):
    """Handle an HTTP redirect for a crawled request.

    Builds a body-less CrawlResponse describing the redirect, derives a new
    CrawlRequest for the redirect target (inheriting settings from the
    original via merge), schedules that request for crawling, and then
    notifies either the request-specific redirect callback or, when none is
    set, the crawler-level on_request_redirect hook.

    :param origin_crawl_request: the crawl request that was redirected.
    :param origin_request: the underlying browser/network request whose URL
        is the redirect target.
    :param response: the redirect response received for the original request.
    """
    crawl_response = CrawlResponse(origin_crawl_request, response.status, response.headers, None)
    redirected_request = CrawlRequest(origin_request.url).merge(origin_crawl_request)
    self.crawl(redirected_request)
    # Fix: the original used a conditional expression as a statement purely for
    # its side effects; an explicit if/else states the dispatch intent clearly.
    if origin_crawl_request.redirect_func:
        origin_crawl_request.redirect_func(crawl_response, redirected_request)
    else:
        self.on_request_redirect(crawl_response, redirected_request)
def test_add_request_should_not_add_duplicate_request_to_queue_when_duplicate_request_filter_is_enabled() -> None:
    """A request differing only in query-parameter order must be rejected as a duplicate."""
    seed = CrawlRequest(url='http://example.com/test?abc=def&ghi=jkl#fragment')
    frontier = CrawlFrontier(CrawlerConfiguration([seed]))
    frontier.get_next_request()

    added = frontier.add_request(CrawlRequest(url='http://example.com/test?ghi=jkl&abc=def'))

    assert added is False
    assert frontier.get_next_request() is None
def _handle_response(self, request: CrawlRequest, response: Response):
    """Dispatch a completed HTTP response to the success or error callback.

    Responses with a 2xx status code take the success path; everything else
    takes the error path. On each path a per-request callback, when set,
    takes precedence over the crawler-level hook.

    :param request: the crawl request that produced this response.
    :param response: the raw response; its body is fetched synchronously here.
    """
    crawl_response = CrawlResponse(request, response.status, response.headers,
                                   syncer.sync(response.text()))
    # Fix: the original used conditional expressions as statements purely for
    # their side effects; explicit if/else makes the dispatch readable.
    if 200 <= response.status < 300:
        if request.success_func:
            request.success_func(crawl_response)
        else:
            self.on_response_success(crawl_response)
    else:
        if request.error_func:
            request.error_func(crawl_response)
        else:
            self.on_response_error(crawl_response)
def test_get_next_request_should_return_next_request_with_higher_priority() -> None:
    """The frontier must serve the priority-1 request before the default-priority one."""
    prioritized = CrawlRequest('http://test.com', priority=1)
    frontier = CrawlFrontier(CrawlerConfiguration([request, prioritized]))

    assert frontier.get_next_request() is prioritized
def test_str_should_return_string_representation() -> None:
    """str() of a configuration should summarize seed, filter, and domain settings."""
    configuration = CrawlerConfiguration(
        [CrawlRequest('https://example.com')],
        filter_offsite_requests=True,
        allowed_domains=['example.com'])
    expected = ('CrawlerConfiguration(seed_requests=1 requests, '
                'filter_duplicate_requests=True, '
                'filter_offsite_requests=True, '
                'allowed_domains=1 domains)')

    assert str(configuration) == expected
def test_seed_requests_should_return_seed_requests() -> None:
    """The seed_requests property should expose the exact list passed to the constructor."""
    seeds = [CrawlRequest('https://example.com')]

    assert CrawlerConfiguration(seeds).seed_requests is seeds
def configure(self) -> CrawlerConfiguration:
    """Configure the crawler with a single seed request for the test URL."""
    seed = CrawlRequest(request_url)
    return CrawlerConfiguration([seed])
def configure(self) -> CrawlerConfiguration:
    """Configure the crawler with a single seed request for the redirect origin."""
    seed = CrawlRequest(redirect_origin_url)
    return CrawlerConfiguration([seed])
def configure(self) -> CrawlerConfiguration:
    """Seed the crawler with two pages; the first page carries a success callback."""
    first = CrawlRequest(first_page_url, success_func=self.on_first_page_response)
    second = CrawlRequest(second_page_url)
    return CrawlerConfiguration([first, second])
def configure(self) -> CrawlerConfiguration:
    """Seed the crawler with both test pages."""
    seeds = [CrawlRequest(first_page_url), CrawlRequest(second_page_url)]
    return CrawlerConfiguration(seeds)
def on_first_response(self, _: CrawlResponse) -> None:
    """On the first response, enqueue the second page and expect it to be accepted."""
    accepted = self.crawl(CrawlRequest(second_page_url))
    assert accepted is True
def configure(self) -> CrawlerConfiguration:
    """Configure a single seed request carrying a custom header."""
    seed = CrawlRequest(request_url, headers={'foo': 'bar'})
    return CrawlerConfiguration([seed])
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from silene.crawl_frontier import CrawlFrontier
from silene.crawl_request import CrawlRequest
from silene.crawler_configuration import CrawlerConfiguration

# Shared seed request reused across the tests in this module.
request = CrawlRequest(url='http://example.com')


def test_add_request_should_add_duplicate_request_to_queue_when_duplicate_request_filter_is_disabled() -> None:
    """With duplicate filtering disabled, re-adding an already-seen request must succeed."""
    configuration = CrawlerConfiguration([request], filter_duplicate_requests=False)
    frontier = CrawlFrontier(configuration)
    frontier.get_next_request()

    assert frontier.add_request(request) is True
    assert frontier.get_next_request() is request
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from silene.crawl_request import CrawlRequest
from silene.crawl_response import CrawlResponse

# Shared originating request reused across the tests in this module.
request = CrawlRequest('https://example.com')


def test_request_should_return_request() -> None:
    """The response should hand back the originating request unchanged."""
    response = CrawlResponse(request, 200, {})
    assert response.request is request


def test_status_should_return_response_status() -> None:
    """The response should expose the HTTP status code it was built with."""
    response = CrawlResponse(request, 200, {})
    assert response.status == 200


def test_headers_should_return_response_headers() -> None:
    """The response should expose the header mapping it was built with."""
    expected_headers = {'Content-Type': 'text/html; charset=utf-8'}
    response = CrawlResponse(request, 200, expected_headers)
    assert response.headers == expected_headers