def test_scrape_correct_request(self):
    """Each page fetch must send the expected query string.

    Only the ``page`` parameter should change between consecutive
    requests; the numeric filters and tag stay constant.
    """
    item = ItemFactory(created_at_i=42)
    httpretty.register_uri(
        httpretty.GET,
        AlgoliaEndpoint.URL,
        responses=self._createPages(pages=2, hits=[item]),
        content_type="application/json")
    gen = Scraper().scrape(tag="test", since=42, until=43)
    # Advancing the generator triggers one HTTP request per page.
    for page in ("0", "1"):
        # Builtin next() works on Python 2.6+ and Python 3, unlike
        # the Python 2-only gen.next() method.
        next(gen)
        self.assertDictEqual(
            httpretty.last_request().querystring,
            {
                "numericFilters": ["created_at_i>42,created_at_i<43"],
                "tags": ["test"],
                "page": [page],
            })
def test_scrape(self):
    """All hits served by the endpoint come back, in order."""
    expected = [ItemFactory(created_at_i=42) for _ in range(2)]
    httpretty.register_uri(
        httpretty.GET,
        AlgoliaEndpoint.URL,
        responses=self._createPages(hits=expected),
        content_type="application/json")
    scraped = list(Scraper().scrape(tag="test", since=42))
    self.assertListEqual(expected, scraped)
def test_scrape_all_fields_are_returned(self):
    """Without a field mapping, every item field is passed through."""
    item = ItemFactory(created_at_i=42)
    httpretty.register_uri(
        httpretty.GET,
        AlgoliaEndpoint.URL,
        responses=self._createPages(hits=[item]),
        content_type="application/json")
    resp = list(Scraper().scrape(tag="test", since=42))
    # sorted() comparison is order-insensitive and, unlike the
    # Python 2-only assertItemsEqual, also works under Python 3.
    self.assertEqual(sorted(resp[0].keys()),
                     sorted(["objectID", "created_at_i", "title"]))
def test_scrape_no_items(self):
    """An endpoint reporting zero hits yields an empty result list."""
    empty = ResponseFactory()
    empty["nbHits"] = 0
    empty["hits"] = []
    httpretty.register_uri(
        httpretty.GET,
        AlgoliaEndpoint.URL,
        body=json.dumps(empty),
        content_type="application/json")
    scraped = list(Scraper().scrape(tag="test", since=42))
    self.assertListEqual(scraped, [])
def test_scrape_translate_fields(self):
    """A ``fields`` mapping renames item keys and drops unmapped ones."""
    item = ItemFactory(created_at_i=42)
    httpretty.register_uri(
        httpretty.GET,
        AlgoliaEndpoint.URL,
        responses=self._createPages(hits=[item]),
        content_type="application/json")
    fields = {"test": "created_at_i"}
    resp = list(Scraper().scrape(tag="test", since=42, fields=fields))
    # sorted() comparison is order-insensitive and, unlike the
    # Python 2-only assertItemsEqual, also works under Python 3.
    self.assertEqual(sorted(resp[0].keys()), ["test"])
def test_scrape_generator(self):
    """scrape() is lazy: items are yielded one at a time across pages."""
    hits = [ItemFactory(created_at_i=42) for _ in range(2)]
    httpretty.register_uri(
        httpretty.GET,
        AlgoliaEndpoint.URL,
        responses=self._createPages(pages=2, hits=hits),
        content_type="application/json")
    gen = Scraper().scrape(tag="test", since=42)
    # Builtin next() works on Python 2.6+ and Python 3, unlike the
    # Python 2-only gen.next() method.
    self.assertEqual(next(gen), hits[0])
    self.assertEqual(next(gen), hits[1])
def test_scrape_page_limit(self):
    """Raise TooManyItemsException when pages run out before nbHits is met."""
    hits = [ItemFactory(created_at_i=42) for _ in range(2)]
    first = httpretty.Response(body=json.dumps(ResponseFactory(hits=hits)))
    # Trick the scraper in thinking it reached the last page but there are more
    # items available.
    trap = ResponseFactory()
    trap["nbHits"] = 3
    trap["hits"] = []  # this needs to be empty
    responses = [first, httpretty.Response(body=json.dumps(trap))]
    httpretty.register_uri(
        httpretty.GET,
        AlgoliaEndpoint.URL,
        responses=responses,
        content_type="application/json")
    with self.assertRaises(TooManyItemsException):
        list(Scraper().scrape(tag="test", since=42))