Example #1
    def _handle_redirect(self, origin_crawl_request: CrawlRequest,
                         origin_request: Request, response: Response):
        crawl_response = CrawlResponse(origin_crawl_request, response.status,
                                       response.headers, None)
        redirected_request = CrawlRequest(
            origin_request.url).merge(origin_crawl_request)

        self.crawl(redirected_request)
        if origin_crawl_request.redirect_func:
            origin_crawl_request.redirect_func(crawl_response,
                                               redirected_request)
        else:
            self.on_request_redirect(crawl_response, redirected_request)
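The redirect handler above dispatches to the request's own redirect_func when one is set and falls back to the crawler-wide on_request_redirect hook otherwise. A minimal sketch of registering such a callback; only the redirect_func attribute is visible above, so passing it as a constructor keyword (by analogy with success_func in Example #9) is an assumption, and the handler itself is illustrative:

from silene.crawl_request import CrawlRequest
from silene.crawl_response import CrawlResponse


def log_redirect(response: CrawlResponse, redirected_request: CrawlRequest) -> None:
    # Receives the body-less response of the original request plus the new
    # request created from the redirect target, matching the dispatch above.
    # The url attribute access is also assumed here.
    print(f'{response.request.url} -> {redirected_request.url} ({response.status})')


# redirect_func keyword assumed by analogy with success_func in Example #9.
redirecting_request = CrawlRequest('http://example.com/old', redirect_func=log_redirect)
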
def test_add_request_should_not_add_duplicate_request_to_queue_when_duplicate_request_filter_is_enabled(
) -> None:
    crawler_configuration = CrawlerConfiguration(
        [CrawlRequest(url='http://example.com/test?abc=def&ghi=jkl#fragment')])
    crawl_frontier = CrawlFrontier(crawler_configuration)
    crawl_frontier.get_next_request()

    result = crawl_frontier.add_request(
        CrawlRequest(url='http://example.com/test?ghi=jkl&abc=def'))

    assert result is False
    assert crawl_frontier.get_next_request() is None
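This test passes only if the duplicate filter treats the two URLs as equivalent: the query parameters are reordered and the fragment is dropped. A sketch of the kind of URL normalization that yields this behavior (illustrative; not necessarily silene's exact implementation):

from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit


def normalize_url(url: str) -> str:
    # Sort query parameters and discard the fragment so that equivalent
    # URLs produce the same key in the seen-set.
    scheme, netloc, path, query, _fragment = urlsplit(url)
    sorted_query = urlencode(sorted(parse_qsl(query)))
    return urlunsplit((scheme, netloc, path, sorted_query, ''))


assert normalize_url('http://example.com/test?abc=def&ghi=jkl#fragment') \
    == normalize_url('http://example.com/test?ghi=jkl&abc=def')
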
Example #3
    def _handle_response(self, request: CrawlRequest, response: Response):
        crawl_response = CrawlResponse(request, response.status,
                                       response.headers,
                                       syncer.sync(response.text()))

        if 200 <= response.status < 300:
            if request.success_func:
                request.success_func(crawl_response)
            else:
                self.on_response_success(crawl_response)
        else:
            if request.error_func:
                request.error_func(crawl_response)
            else:
                self.on_response_error(crawl_response)
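In the dispatch above, 2xx responses go to the request's success_func (or the crawler's on_response_success) and everything else goes to error_func (or on_response_error). A usage sketch; the success_func keyword is confirmed by Example #9, while passing error_func the same way is an assumption based on the attribute seen above:

from silene.crawl_request import CrawlRequest
from silene.crawl_response import CrawlResponse


def handle_success(response: CrawlResponse) -> None:
    print(f'OK {response.status}')  # reached for statuses 200-299


def handle_error(response: CrawlResponse) -> None:
    print(f'Error {response.status}')  # reached for anything outside 200-299


# error_func keyword assumed by analogy with success_func.
monitored_request = CrawlRequest('https://example.com',
                                 success_func=handle_success,
                                 error_func=handle_error)
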
def test_get_next_request_should_return_next_request_with_higher_priority(
) -> None:
    high_priority_request = CrawlRequest('http://test.com', priority=1)
    crawler_configuration = CrawlerConfiguration(
        [request, high_priority_request])
    crawl_frontier = CrawlFrontier(crawler_configuration)

    assert crawl_frontier.get_next_request() is high_priority_request
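Here the request created with priority=1 outranks the default-priority module-level request, so the frontier must order its queue by priority. One common way to get that behavior is a max-priority heap; a minimal illustrative sketch, not silene's actual internals:

import heapq
from itertools import count


class PriorityRequestQueue:
    def __init__(self):
        self._heap = []
        self._order = count()  # tie-breaker keeps FIFO order for equal priorities

    def push(self, request, priority: int = 0) -> None:
        # heapq is a min-heap, so negate the priority to pop the highest first.
        heapq.heappush(self._heap, (-priority, next(self._order), request))

    def pop(self):
        return heapq.heappop(self._heap)[2] if self._heap else None
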
def test_str_should_return_string_representation() -> None:
    crawler_configuration = CrawlerConfiguration([CrawlRequest('https://example.com')],
                                                 filter_offsite_requests=True,
                                                 allowed_domains=['example.com'])

    assert str(crawler_configuration) == 'CrawlerConfiguration(seed_requests=1 requests, ' \
                                         'filter_duplicate_requests=True, ' \
                                         'filter_offsite_requests=True, ' \
                                         'allowed_domains=1 domains)'
def test_seed_requests_should_return_seed_requests() -> None:
    seed_requests = [CrawlRequest('https://example.com')]
    crawler_configuration = CrawlerConfiguration(seed_requests)

    assert crawler_configuration.seed_requests is seed_requests
Example #7
    def configure(self) -> CrawlerConfiguration:
        return CrawlerConfiguration([CrawlRequest(request_url)])
Example #8
    def configure(self) -> CrawlerConfiguration:
        return CrawlerConfiguration([CrawlRequest(redirect_origin_url)])
Example #9
    def configure(self) -> CrawlerConfiguration:
        return CrawlerConfiguration([
            CrawlRequest(first_page_url,
                         success_func=self.on_first_page_response),
            CrawlRequest(second_page_url)
        ])
Example #10
    def configure(self) -> CrawlerConfiguration:
        return CrawlerConfiguration([CrawlRequest(first_page_url), CrawlRequest(second_page_url)])
Example #11
    def on_first_response(self, _: CrawlResponse) -> None:
        assert self.crawl(CrawlRequest(second_page_url)) is True
Example #12
    def configure(self) -> CrawlerConfiguration:
        return CrawlerConfiguration([CrawlRequest(request_url, headers={'foo': 'bar'})])

from silene.crawl_frontier import CrawlFrontier
from silene.crawl_request import CrawlRequest
from silene.crawler_configuration import CrawlerConfiguration

request = CrawlRequest(url='http://example.com')


def test_add_request_should_add_duplicate_request_to_queue_when_duplicate_request_filter_is_disabled(
) -> None:
    crawler_configuration = CrawlerConfiguration(
        [request], filter_duplicate_requests=False)
    crawl_frontier = CrawlFrontier(crawler_configuration)
    crawl_frontier.get_next_request()

    result = crawl_frontier.add_request(request)

    assert result is True
    assert crawl_frontier.get_next_request() is request
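Together with the duplicate-filter test earlier, these tests pin down the frontier contract: add_request reports whether the request was accepted, and get_next_request returns None once the queue is empty. A small drain loop built on that contract (assuming CrawlRequest exposes the URL it was given as a url attribute):

from silene.crawl_frontier import CrawlFrontier
from silene.crawl_request import CrawlRequest
from silene.crawler_configuration import CrawlerConfiguration

crawler_configuration = CrawlerConfiguration(
    [CrawlRequest(url='http://example.com/a'), CrawlRequest(url='http://example.com/b')])
crawl_frontier = CrawlFrontier(crawler_configuration)

# get_next_request signals exhaustion with None, so drain until then.
while (next_request := crawl_frontier.get_next_request()) is not None:
    print(next_request.url)  # hand the request to the crawler here
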


from silene.crawl_request import CrawlRequest
from silene.crawl_response import CrawlResponse

request = CrawlRequest('https://example.com')


def test_request_should_return_request() -> None:
    assert CrawlResponse(request, 200, {}).request is request


def test_status_should_return_response_status() -> None:
    assert CrawlResponse(request, 200, {}).status == 200


def test_headers_should_return_response_headers() -> None:
    headers = {'Content-Type': 'text/html; charset=utf-8'}

    assert CrawlResponse(request, 200, headers).headers == headers