def test_skip_downloads(settings):
    """A regular spider must trigger a real download, while a spider that
    annotates its response as ``DummyResponse`` must skip the download
    (no downloader request recorded in the crawler stats).
    """
    # Regular spider: a real Response is produced by the downloader.
    item, url, crawler = yield crawl_single_item(MySpider, ProductHtml, settings)
    assert isinstance(item['response'], Response)
    assert not isinstance(item['response'], DummyResponse)
    stats = crawler.stats.get_stats()
    assert stats.get('downloader/request_count', 0) == 1
    assert stats.get('downloader/response_count', 0) == 1

    # Skip-download spider: the injected response is a DummyResponse and
    # no downloader request is issued (response_count still registers the
    # dummy response passed through the downloader middleware).
    item, url, crawler = yield crawl_single_item(SkipDownloadSpider, ProductHtml, settings)
    assert isinstance(item['response'], Response)
    assert isinstance(item['response'], DummyResponse)
    stats = crawler.stats.get_stats()
    assert stats.get('downloader/request_count', 0) == 0
    assert stats.get('downloader/response_count', 0) == 1
def test_multi_args_callbacks(settings):
    """Callbacks mixing injected page objects, provided dependencies and
    plain callback kwargs should all receive the expected values.
    """
    item, _, _ = yield crawl_single_item(MultiArgsCallbackSpider, ProductHtml, settings)
    # Exact-type checks: `is` instead of `==` for class identity (PEP 8).
    assert type(item['product']) is ProductPage
    assert type(item['provided']) is ProvidedAsyncTest
    assert item['cb_arg'] == "arg!"
    # Singleton comparison must use `is None`, not `== None`.
    assert item['non_cb_arg'] is None
def test_providers_returning_wrong_classes(settings):
    """The injection middleware must fail loudly (AssertionError) when a
    provider returns instances of classes it does not declare as provided.
    """
    with pytest.raises(AssertionError):
        yield crawl_single_item(spider_for(ExtraClassData), ProductHtml, settings)
def test_price_first_spider(settings):
    """With the price-first provider order, the Html value comes from the
    price provider.
    """
    expected = {
        Price: Price("22€"),
        Name: Name("Chocolate"),
        Html: Html("Price Html!"),
    }
    item, _, _ = yield crawl_single_item(PriceFirstMultiProviderSpider, ProductHtml, settings)
    assert item == expected
def test_optional_and_unions(settings):
    """Optional and Union annotations resolve to the expected injected
    objects (or ``None`` when nothing can be provided).
    """
    item, _, _ = yield crawl_single_item(spider_for(OptionalAndUnionPage), ProductHtml, settings)
    breadcrumbs = item['breadcrumbs']
    assert breadcrumbs.response is item['response']
    # Optionals: resolvable one is injected, unresolvable one is None.
    assert item['opt_check_1'] is breadcrumbs
    assert item['opt_check_2'] is None
    # Unions: first resolvable member of the union wins.
    assert item['union_check_1'] is breadcrumbs
    assert item['union_check_2'] is breadcrumbs.response
    assert item['union_check_3'] is None
    assert item['union_check_5'] is breadcrumbs
def test_basic_case(settings):
    """A plain ProductPage page object extracts every field from the
    sample product HTML.
    """
    item, url, _ = yield crawl_single_item(spider_for(ProductPage), ProductHtml, settings)
    expected = {
        'url': url,
        'name': 'Chocolate',
        'price': '22€',
        'description': 'The best chocolate ever',
        'category': 'Food / Sweets',
    }
    assert item == expected
def test_overrides(settings):
    """SCRAPY_POET_OVERRIDES keyed on the crawl host's domain replaces
    BreadcrumbsExtraction with the overridden page object, changing only
    the category field of the extracted item.
    """
    local_host = socket.gethostbyname(socket.gethostname())
    local_domain = get_domain(local_host)
    settings["SCRAPY_POET_OVERRIDES"] = {
        local_domain: {BreadcrumbsExtraction: OverridenBreadcrumbsExtraction},
    }
    item, url, _ = yield crawl_single_item(spider_for(ProductPage), ProductHtml, settings)
    assert item == {
        'url': url,
        'name': 'Chocolate',
        'price': '22€',
        'description': 'The best chocolate ever',
        'category': 'overriden_breadcrumb',
    }
def test_name_first_spider(settings, tmp_path):
    """With the name-first provider order the Html comes from the name
    provider; additionally, enabling SCRAPY_POET_CACHE must create the
    cache file and serve subsequent crawls from it.
    """
    cache_path = tmp_path / "cache.sqlite3"
    settings.set("SCRAPY_POET_CACHE", str(cache_path))

    expected = {
        Price: Price("22€"),
        Name: Name("Chocolate"),
        Html: Html("Name Html!"),
        "response_data_html": ProductHtml.html,
    }

    item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, ProductHtml, settings)
    assert cache_path.exists()
    assert item == expected

    # Cache check: crawl a different (wrong) resource — the cached result
    # must be returned, so the item is unchanged.
    item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, NonProductHtml, settings)
    assert item == expected
def test_scrapy_dependencies_on_providers(scrapy_class, settings):
    """Scrapy dependencies should be injected into Providers."""

    @attr.s(auto_attribs=True)
    class PageData:
        # Name of the Scrapy class the provider received.
        scrapy_class: str

    class PageDataProvider(PageObjectInputProvider):
        provided_classes = {PageData}

        # `obj` is annotated with the parametrized scrapy_class, so the
        # injector must supply an instance of it.
        def __call__(self, to_provide, obj: scrapy_class):
            return [PageData(scrapy_class=scrapy_class.__name__)]

    @attr.s(auto_attribs=True)
    class Page(ItemWebPage):
        page_data: PageData

        def to_item(self):
            return {"scrapy_class": self.page_data.scrapy_class}

    class MySpider(Spider):
        name = "my_spider"
        url = None
        custom_settings = {
            "SCRAPY_POET_PROVIDERS": {
                ResponseDataProvider: 1,
                PageDataProvider: 2,
            }
        }

        def start_requests(self):
            yield Request(url=self.url, callback=self.parse)

        def parse(self, response, page: Page):
            return page.to_item()

    item, url, crawler = yield crawl_single_item(MySpider, ProductHtml, settings)
    assert item["scrapy_class"] == scrapy_class.__name__
# NOTE(review): same name as the previous test_scrapy_dependencies_on_providers;
# if both definitions live in this module, this one shadows the earlier one and
# pytest will only collect this version — confirm whether a rename is intended.
def test_scrapy_dependencies_on_providers(scrapy_class, settings):
    """Scrapy dependencies should be injected into Providers."""

    @attr.s(auto_attribs=True)
    class PageData:
        # Name of the Scrapy class the provider received.
        scrapy_class: str

    # Legacy @provides-decorator style: the dependency is injected via the
    # provider's constructor rather than via __call__.
    @provides(PageData)
    class PageDataProvider(PageObjectInputProvider):
        def __init__(self, obj: scrapy_class):
            self.obj = obj

        def __call__(self):
            return PageData(scrapy_class=scrapy_class.__name__, )

    @attr.s(auto_attribs=True)
    class Page(ItemWebPage):
        page_data: PageData

        def to_item(self):
            return {"scrapy_class": self.page_data.scrapy_class}

    class MySpider(Spider):
        name = "my_spider"
        url = None

        def start_requests(self):
            yield Request(url=self.url, callback=self.parse)

        def parse(self, response, page: Page):
            return page.to_item()

    item, url, crawler = yield crawl_single_item(MySpider, ProductHtml, settings)
    assert item["scrapy_class"] == scrapy_class.__name__
def test_providers(settings):
    """A provided dependency carries the provider-built message and no
    response object.
    """
    item, _, _ = yield crawl_single_item(spider_for(ProvidersPage), ProductHtml, settings)
    assert item['provided'].msg == "Provided 5!"
    # Singleton comparison must use `is None`, not `== None` (PEP 8).
    assert item['provided'].response is None