Example #1
0
    def test_build_instances_methods(self, injector):
        """Compare ``build_instances`` with ``build_instances_from_providers``.

        Both are run against the same request, response and plan. The first
        is expected to yield an instance for every type listed in
        ``expected_all``; the second only those in ``expected_provided``.
        """
        def callback(
            response: DummyResponse,
            a: Cls1,
            b: Cls2,
            c: WrapCls,
            d: ClsNoProviderRequired,
        ):
            pass

        response = get_response_for_testing(callback)
        request = response.request
        plan = injector.build_plan(request)

        # Full build: every type involved in the plan gets an instance.
        all_instances = yield from injector.build_instances(
            request, response, plan)
        expected_all = {
            Cls1: Cls1(),
            Cls2: Cls2(),
            WrapCls: WrapCls(ClsReqResponse()),
            ClsReqResponse: ClsReqResponse(),
            ClsNoProviderRequired: ClsNoProviderRequired(),
        }
        assert all_instances == expected_all

        # Providers-only build: WrapCls and ClsNoProviderRequired are absent.
        provided_instances = yield from injector.build_instances_from_providers(
            request, response, plan)
        expected_provided = {
            Cls1: Cls1(),
            Cls2: Cls2(),
            ClsReqResponse: ClsReqResponse(),
        }
        assert provided_instances == expected_provided
    def test_on_exception(self, provided_cls: AutoExtractData):
        """Check the stats recorded when the provider's request raises.

        The stub ``do_request`` bumps the aggregated attempt counters and
        then raises a plain ``Exception``. Building the callback dependencies
        must raise, and the crawler stats must match ``expected``.

        :param provided_cls: data class requested by the callback; its
            ``page_type`` is used to build the per-page-type stat keys.
        """
        class Provider(AutoExtractProvider):
            async def do_request(self, *args, agg_stats: AggStats, **kwargs):
                agg_stats.n_attempts += 3
                agg_stats.n_billable_query_responses += 2
                raise Exception()

        def callback(item: provided_cls):
            pass

        page_type = provided_cls.page_type
        injector = get_injector_for_testing({Provider: 500})
        response = get_response_for_testing(callback)
        # The raised exception is never inspected, so it is not bound to a
        # name here.
        with pytest.raises(Exception):
            yield injector.build_callback_dependencies(response.request,
                                                       response)
        stats = injector.crawler.stats
        expected = {
            f'autoextract/{page_type}/pages/count': 1,
            f'autoextract/{page_type}/pages/errors': 1,
            # These two mirror the increments done in ``do_request`` above.
            'autoextract/total/attempts/count': 3,
            'autoextract/total/attempts/billable': 2,
            'autoextract/total/pages/count': 1,
            'autoextract/total/pages/errors': 1,
            'autoextract/total/pages/errors/rest/Exception': 1
        }
        assert_stats(stats, expected)
    def test_on_query_error(self, provided_cls: AutoExtractData):
        """Check error reporting when the response carries a query error.

        The stub ``do_request`` returns a single result containing an
        ``"error"`` entry. Building the callback dependencies must raise
        ``QueryError`` mentioning both the query and the error text, and the
        crawler stats must match ``expected``.
        """
        error_result = {"query": "The query", "error": "Download error"}
        page_type = provided_cls.page_type

        class Provider(AutoExtractProvider):
            async def do_request(self, *args, agg_stats: AggStats, **kwargs):
                agg_stats.n_attempts += 3
                agg_stats.n_billable_query_responses += 2
                return [error_result]

        def callback(item: provided_cls):
            pass

        injector = get_injector_for_testing({Provider: 500})
        response = get_response_for_testing(callback)
        with pytest.raises(QueryError) as exinf:
            yield injector.build_callback_dependencies(
                response.request, response)

        expected = {
            f'autoextract/{page_type}/pages/count': 1,
            f'autoextract/{page_type}/pages/errors': 1,
            'autoextract/total/attempts/count': 3,
            'autoextract/total/attempts/billable': 2,
            'autoextract/total/pages/count': 1,
            'autoextract/total/pages/errors': 1,
            'autoextract/total/pages/errors/query/Download error': 1,
        }
        assert_stats(injector.crawler.stats, expected)

        # The exception text must expose both pieces of the failed result.
        message = str(exinf.value)
        assert "Download error" in message
        assert "The query" in message
Example #4
0
    def test_overrides(self, providers, override_should_happen):
        """Check that per-domain overrides apply only to the matching domain.

        The request domain is example.com. When ``override_should_happen``
        is true the overrides are registered for that domain and must be
        applied; otherwise they are registered for other-example.com and
        must be ignored.
        """
        if override_should_happen:
            domain = "example.com"
        else:
            domain = "other-example.com"

        registry = PerDomainOverridesRegistry({
            domain: {
                PricePO: PriceInDollarsPO,
                EurDollarRate: OtherEurDollarRate,
            },
        })
        injector = get_injector_for_testing(
            providers, overrides_registry=registry)

        def callback(response: DummyResponse,
                     price_po: PricePO,
                     rate_po: EurDollarRate):
            pass

        response = get_response_for_testing(callback)
        kwargs = yield from injector.build_callback_dependencies(
            response.request, response)
        kwargs_types = {name: type(dep) for name, dep in kwargs.items()}
        item = kwargs["price_po"].to_item()

        if not override_should_happen:
            assert kwargs_types == {
                "price_po": PricePO,
                "rate_po": EurDollarRate,
            }
            assert item == {"price": 22, "currency": "€"}
        else:
            assert kwargs_types == {
                "price_po": PriceInDollarsPO,
                "rate_po": OtherEurDollarRate,
            }
            # OtherEurDollarRate has no effect inside PriceInDollarsPO
            # because composing overrides is forbidden.
            assert item == {"price": 22 * 1.1, "currency": "$"}
Example #5
0
    def test_is_scrapy_response_required(self, injector):
        """Check ``is_scrapy_response_required`` for three callback shapes.

        Expected outcomes, as asserted below:

        * ``response`` annotated as ``DummyResponse`` with a ``Cls1``
          dependency: not required.
        * un-annotated ``response``: required.
        * ``DummyResponse`` with a ``ClsReqResponse`` dependency: required.
        """
        def callback_no_1(response: DummyResponse, a: Cls1):
            pass

        def callback_yes_1(response, a: Cls1):
            pass

        def callback_yes_2(response: DummyResponse, a: ClsReqResponse):
            pass

        cases = [
            (callback_no_1, False),
            (callback_yes_1, True),
            (callback_yes_2, True),
        ]
        for callback, required in cases:
            request = get_response_for_testing(callback).request
            outcome = injector.is_scrapy_response_required(request)
            assert bool(outcome) is required
    async def test_on_cancellation(self, provided_cls: AutoExtractProductData):
        """Check the stats recorded when a pending request is cancelled.

        The stub provider blocks forever on an already-held lock. While the
        dependencies are being built, the test sends SIGINT to its own
        process and then expects the build to finish with a
        ``CancelledError`` and the "cancelled" counters to be incremented.

        The SIGINT handler in place on entry is restored on exit.
        """
        # Remember the current SIGINT handler and swap in one that does
        # nothing; the original is restored in the ``finally`` clause.
        old_handler = signal.getsignal(SIGINT)
        signal.signal(SIGINT, lambda x, y: None)
        try:
            # Hold the lock so that the second ``acquire()`` inside
            # ``do_request`` never completes on its own.
            lock = asyncio.Lock()
            await lock.acquire()

            class Provider(AutoExtractProvider):
                async def do_request(self, *args, agg_stats: AggStats,
                                     **kwargs):
                    await lock.acquire()

            def callback(item: provided_cls):
                pass

            injector = get_injector_for_testing({Provider: 500})
            stats = injector.crawler.stats
            response = get_response_for_testing(callback)
            deferred = injector.build_callback_dependencies(
                response.request, response)
            # Wrap the deferred as an asyncio future so it can be awaited
            # together with the coroutine below.
            build_callbacks_future = Deferred.asFuture(
                deferred, asyncio.get_event_loop())

            async def cancel_after(sleep):
                # Wait ``sleep`` seconds, then send SIGINT to this process.
                await asyncio.sleep(sleep)
                pid = os.getpid()
                try:
                    os.kill(pid, SIGINT)
                except KeyboardInterrupt:
                    # Sending SIGINT to our own process may surface here as
                    # a KeyboardInterrupt. That is fine.
                    pass
                # Returned (not raised) so that every entry in the gathered
                # results passes the isinstance check below.
                return CancelledError()

            # NOTE(review): presumably the provider reacts to SIGINT by
            # cancelling the pending request — confirm against the provider
            # implementation.
            result = await asyncio.gather(build_callbacks_future,
                                          cancel_after(0.05),
                                          return_exceptions=True)
            assert all([isinstance(r, CancelledError) for r in result])

            page_type = provided_cls.page_type
            expected_stats = {
                'autoextract/total/pages/count': 1,
                'autoextract/total/pages/cancelled': 1,
                'autoextract/total/pages/errors': 0,
                f'autoextract/{page_type}/pages/count': 1,
                f'autoextract/{page_type}/pages/cancelled': 1,
                f'autoextract/{page_type}/pages/errors': 0,
            }
            assert_stats(stats, expected_stats)

        finally:
            signal.signal(SIGINT, old_handler)
Example #7
0
    def test_build_instances_from_providers_respect_priorities(self, str_list):
        """Check that the provider with the lowest priority value wins.

        One provider is registered per entry of ``str_list``, each with
        priority ``int(text)``. All of them declare ``str`` as the provided
        type, so the ``str`` instance that ends up in the result tells which
        provider was used.

        :param str_list: strings that must each be parseable by ``int``.
        """
        # presumably get_provider({str}, text) builds a provider that
        # returns ``text`` for ``str`` — verify against the helper.
        providers = {get_provider({str}, text): int(text) for text in str_list}
        injector = get_injector_for_testing(providers)

        def callback(response: DummyResponse, arg: str):
            pass

        response = get_response_for_testing(callback)
        plan = injector.build_plan(response.request)
        instances = yield from injector.build_instances_from_providers(
            response.request, response, plan)

        # Priorities are numeric, so the expected winner must be picked
        # numerically as well: a plain min() over strings is lexicographic
        # and would choose "10" over "9".
        assert instances[str] == min(str_list, key=int)
Example #8
0
    def test_build_callback_dependencies(self, injector):
        """Check the keyword arguments built for a callback.

        Each annotated callback parameter (other than ``response``) must be
        present in the result, holding an instance of exactly its annotated
        type.
        """
        def callback(
            response: DummyResponse,
            a: Cls1,
            b: Cls2,
            c: WrapCls,
            d: ClsNoProviderRequired,
        ):
            pass

        response = get_response_for_testing(callback)
        kwargs = yield from injector.build_callback_dependencies(
            response.request, response)

        expected_types = dict(
            a=Cls1,
            b=Cls2,
            c=WrapCls,
            d=ClsNoProviderRequired,
        )
        actual_types = {name: type(dep) for name, dep in kwargs.items()}
        assert actual_types == expected_types
Example #9
0
    def test_build_instances_from_providers_unexpected_return(self):
        """Check that returning an undeclared type is reported as an error.

        ``WrongProvider`` declares only ``Cls1`` but appends a ``Cls2``
        instance to what it returns. Building instances must raise
        ``UndeclaredProvidedTypeError`` with a message that mentions the
        provider and both classes.
        """
        class WrongProvider(get_provider({Cls1})):
            def __call__(self, to_provide):
                declared = super().__call__(to_provide)
                # Cls2 is not among the types this provider declares.
                return declared + [Cls2()]

        def callback(response: DummyResponse, a: Cls1):
            pass

        injector = get_injector_for_testing({WrongProvider: 0})
        response = get_response_for_testing(callback)
        plan = injector.build_plan(response.request)
        with pytest.raises(UndeclaredProvidedTypeError) as exinf:
            yield from injector.build_instances_from_providers(
                response.request, response, plan)

        message = str(exinf.value)
        for fragment in ("Provider", "Cls2", "Cls1"):
            assert fragment in message
    def test_providers(self, provided_cls: AutoExtractProductData):
        """Exercise the provider for item-only, item+HTML and HTML-only callbacks.

        A stub ``do_request`` returns a canned result and checks the keyword
        arguments it receives against the settings given to the injector.
        After each of the three builds, the accumulated crawler stats are
        compared with the expected counters.

        :param provided_cls: data class requested by the callbacks; its
            ``page_type`` keys the canned result and the per-type stats.
        """
        page_type = provided_cls.page_type
        url, html = "http://example.com", "html_content"
        data_wo_html = {page_type: {"url": url}}
        data = {page_type: {"url": url}, "html": html}

        class Provider(AutoExtractProvider):
            async def do_request(self, *args, agg_stats: AggStats, **kwargs):
                # ``provider`` is a closure variable bound further down,
                # before the first request is built.
                assert provider.aiohttp_session.connector.limit == 2020
                agg_stats.n_attempts += 3
                agg_stats.n_billable_query_responses += 2
                assert kwargs['api_key'] == "key"
                assert kwargs['endpoint'] == "url"
                assert kwargs['max_query_error_retries'] == 31415
                # Hand out a copy so the shared ``data`` dict stays intact
                # across the three builds.
                return [copy.deepcopy(data)]

        def callback(item: provided_cls):
            pass

        def callback_with_html(item: provided_cls, html: AutoExtractHtml):
            pass

        def callback_only_html(html: AutoExtractHtml):
            pass

        # Each value here mirrors an assertion made in ``do_request`` or
        # right below. presumably AUTOEXTRACT_USER is forwarded as
        # ``api_key`` — verify against the provider implementation.
        settings = {
            "AUTOEXTRACT_USER": "key",
            "AUTOEXTRACT_URL": "url",
            "AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES": 31415,
            "CONCURRENT_REQUESTS": 2020,
            "CONCURRENT_REQUESTS_PER_DOMAIN": 1980,
        }
        injector = get_injector_for_testing({Provider: 500}, settings)
        stats = injector.crawler.stats
        provider = injector.providers[-1]
        assert provider.per_domain_semaphore.concurrency_per_slot == 1980

        #  - No HTML requested case -

        response = get_response_for_testing(callback)
        kwargs = yield injector.build_callback_dependencies(
            response.request, response)
        assert kwargs["item"].data == data_wo_html
        assert type(kwargs["item"]) is provided_cls
        expected_stats = {
            'autoextract/total/pages/count': 1,
            'autoextract/total/pages/success': 1,
            'autoextract/total/attempts/count': 3,
            'autoextract/total/attempts/billable': 2,
            f'autoextract/{page_type}/pages/count': 1,
            f'autoextract/{page_type}/pages/success': 1
        }
        assert_stats(stats, expected_stats)

        #  - Both HTML and item requested case -

        response = get_response_for_testing(callback_with_html)
        kwargs = yield injector.build_callback_dependencies(
            response.request, response)
        item, html_response = kwargs["item"], kwargs["html"]
        assert item.data == data_wo_html
        assert type(item) is provided_cls
        assert (html_response.url, html_response.html) == (url, html)
        assert type(html_response) is AutoExtractHtml
        # Stats accumulate across builds: counters now cover two pages.
        expected_stats = {
            'autoextract/total/pages/count': 2,
            'autoextract/total/pages/success': 2,
            'autoextract/total/pages/html': 1,
            'autoextract/total/attempts/count': 6,
            'autoextract/total/attempts/billable': 4,
            f'autoextract/{page_type}/pages/count': 2,
            f'autoextract/{page_type}/pages/success': 2,
            f'autoextract/{page_type}/pages/html': 1,
        }
        assert_stats(stats, expected_stats)

        #  - Only HTML is requested case -

        injector.providers[0].page_type_class_for_html = provided_cls
        response = get_response_for_testing(callback_only_html)
        kwargs = yield injector.build_callback_dependencies(
            response.request, response)
        assert "item" not in kwargs
        html_response = kwargs["html"]
        assert (html_response.url, html_response.html) == (url, html)
        assert type(html_response) is AutoExtractHtml
        expected_stats = {
            'autoextract/total/pages/count': 3,
            'autoextract/total/pages/success': 3,
            'autoextract/total/pages/html': 2,
            'autoextract/total/attempts/count': 9,
            'autoextract/total/attempts/billable': 6,
            f'autoextract/{page_type}/pages/count': 3,
            f'autoextract/{page_type}/pages/success': 3,
            f'autoextract/{page_type}/pages/html': 2,
        }
        assert_stats(stats, expected_stats)
Example #11
0
def test_cache(tmp_path, cache_errors):
    """
    Run two sets of providers against the same on-disk cache.

    In the first run the cache is empty and two requests are made, the
    second of which fails with ``ValueError``. The second run uses providers
    configured with different content and a different error, yet the
    successful request must return the values from the first run. For the
    failing request the expected error depends on ``cache_errors``:
    ``ValueError`` when it is enabled, ``KeyError`` otherwise.
    """

    def check_first_run_values(built):
        assert built[str] == "foo"
        assert built[int] == 3
        assert built[float] == 3.0

    first_run_providers = {
        get_provider_for_cache({str}, "str", content="foo"): 1,
        get_provider_for_cache({int, float}, "number", content=3): 2,
    }

    cache = tmp_path / "cache3.sqlite3"
    if cache.exists():
        print(f"Cache file {cache} already exists. Weird. Deleting")
        cache.unlink()
    settings = {
        "SCRAPY_POET_CACHE": cache,
        "SCRAPY_POET_CACHE_ERRORS": cache_errors,
    }
    injector = get_injector_for_testing(first_run_providers, settings)
    assert cache.exists()

    def callback(response: DummyResponse,
                 arg_str: str,
                 arg_int: int,
                 arg_float: float):
        pass

    failing_url = "http://willfail.page"

    # --- First run: nothing cached yet ---
    response = get_response_for_testing(callback)
    plan = injector.build_plan(response.request)
    instances = yield from injector.build_instances_from_providers(
        response.request, response, plan)
    check_first_run_values(instances)

    # Changing the request URL makes the providers fail with:
    #   <twisted.python.failure.Failure builtins.ValueError: The URL is not
    #   from example.com>>
    response.request = Request.replace(response.request, url=failing_url)
    with pytest.raises(ValueError):
        failing_plan = injector.build_plan(response.request)
        yield from injector.build_instances_from_providers(
            response.request, response, failing_plan)

    # --- Second run: different providers, same cache ---
    # These would return a different result, but the cached data should
    # prevail.
    second_run_providers = {
        get_provider_for_cache(
            {str}, "str", content="bar", error=KeyError): 1,
        get_provider_for_cache(
            {int, float}, "number", content=4, error=KeyError): 2,
    }
    injector = get_injector_for_testing(second_run_providers, settings)

    response = get_response_for_testing(callback)
    plan = injector.build_plan(response.request)
    instances = yield from injector.build_instances_from_providers(
        response.request, response, plan)
    check_first_run_values(instances)

    # With error caching disabled the new providers' KeyError is raised;
    # otherwise the ValueError from the first run is expected.
    if cache_errors:
        expected_error = ValueError
    else:
        expected_error = KeyError
    response.request = Request.replace(response.request, url=failing_url)
    with pytest.raises(expected_error):
        failing_plan = injector.build_plan(response.request)
        yield from injector.build_instances_from_providers(
            response.request, response, failing_plan)