Example #1
    def test_scrape_correct_request(self):
        item = ItemFactory(created_at_i=42)

        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               responses=self._createPages(pages=2,
                                                           hits=[item]),
                               content_type="application/json")

        gen = Scraper().scrape(tag="test", since=42, until=43)

        next(gen)
        self.assertDictEqual(
            httpretty.last_request().querystring, {
                "numericFilters": ["created_at_i>42,created_at_i<43"],
                "tags": ["test"],
                "page": ["0"]
            })

        next(gen)
        self.assertDictEqual(
            httpretty.last_request().querystring, {
                "numericFilters": ["created_at_i>42,created_at_i<43"],
                "tags": ["test"],
                "page": ["1"]
            })
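
All of these tests lean on a _createPages helper that is not shown on this page. A minimal sketch of what it presumably does, assuming ResponseFactory (sketched under Example #3 below) builds Algolia-style response dicts: serialize one httpretty.Response per result page, then append an empty final page so the scraper knows it has reached the end.

    def _createPages(self, pages=1, hits=None):
        # Sketch only: one canned response per page, each carrying the
        # given hits. nbHits mirrors Algolia's semantics (total matching
        # hits across all pages).
        hits = hits or []
        total = len(hits) * pages
        responses = []
        for _ in range(pages):
            page = ResponseFactory(hits=hits)
            page["nbHits"] = total
            responses.append(httpretty.Response(body=json.dumps(page)))
        # Final page: no hits left; nbHits matches what was delivered so
        # the scraper's page-limit guard stays quiet.
        lastPage = ResponseFactory()
        lastPage["nbHits"] = total
        lastPage["hits"] = []
        responses.append(httpretty.Response(body=json.dumps(lastPage)))
        return responses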
Example #2
    def test_scrape(self):
        hits = [ItemFactory(created_at_i=42) for _ in range(2)]

        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               responses=self._createPages(hits=hits),
                               content_type="application/json")

        resp = list(Scraper().scrape(tag="test", since=42))
        self.assertListEqual(hits, resp)
Example #3
    def test_scrape_all_fields_are_returned(self):
        item = ItemFactory(created_at_i=42)

        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               responses=self._createPages(hits=[item]),
                               content_type="application/json")

        resp = list(Scraper().scrape(tag="test", since=42))
        self.assertCountEqual(resp[0].keys(),
                              ["objectID", "created_at_i", "title"])
Example #4
    def test_scrape_no_items(self):
        lastPage = ResponseFactory()
        lastPage["nbHits"] = 0
        lastPage["hits"] = []

        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               body=json.dumps(lastPage),
                               content_type="application/json")

        resp = list(Scraper().scrape(tag="test", since=42))
        self.assertListEqual(resp, [])
Example #5
    def test_scrape_translate_fields(self):
        item = ItemFactory(created_at_i=42)

        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               responses=self._createPages(hits=[item]),
                               content_type="application/json")

        fields = {"test": "created_at_i"}

        resp = list(Scraper().scrape(tag="test", since=42, fields=fields))
        self.assertCountEqual(resp[0].keys(), ["test"])
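
The fields parameter is a mapping from output key to source key: with {"test": "created_at_i"} every yielded item contains only the key test, holding the hit's created_at_i value. The renaming step this implies could be as small as the following (translate_fields is a hypothetical name, not necessarily the Scraper's own):

def translate_fields(hit, fields):
    # Hypothetical helper: whitelist and rename hit keys according to
    # the {output_key: source_key} mapping passed to Scraper.scrape().
    return {out_key: hit[src_key] for out_key, src_key in fields.items()}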
Example #6
    def test_scrape_generator(self):
        hits = [ItemFactory(created_at_i=42) for _ in range(2)]

        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               responses=self._createPages(pages=2, hits=hits),
                               content_type="application/json")

        gen = Scraper().scrape(tag="test", since=42)
        resp = next(gen)
        self.assertEqual(resp, hits[0])
        resp = next(gen)
        self.assertEqual(resp, hits[1])
Example #7
    def test_scrape_page_limit(self):
        hits = [ItemFactory(created_at_i=42) for _ in range(2)]
        pages = [
            httpretty.Response(body=json.dumps(ResponseFactory(hits=hits)))
        ]

        lastPage = ResponseFactory()
        # Trick the scraper into thinking it reached the last page even though
        # more items are available.
        lastPage["nbHits"] = 3
        lastPage["hits"] = []  # this needs to be empty

        pages.append(httpretty.Response(body=json.dumps(lastPage)))

        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               responses=pages,
                               content_type="application/json")

        with self.assertRaises(TooManyItemsException):
            list(Scraper().scrape(tag="test", since=42))
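
This last test documents the scraper's safety valve: Algolia caps how deep pagination can go, so an empty page arriving while nbHits still reports unretrieved items means some results are unreachable, and the scraper raises TooManyItemsException rather than silently returning a partial set. A sketch of such a guard (check_page and seen are invented names for illustration):

def check_page(resp, seen):
    # Sketch only: `seen` is the number of hits yielded so far. An empty
    # page with hits still outstanding means the result set was truncated;
    # narrowing the since/until window is the usual fix.
    if not resp["hits"] and seen < resp["nbHits"]:
        raise TooManyItemsException(
            "retrieved %d of %d items" % (seen, resp["nbHits"]))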