def test_scrape_correct_request(self):
        """Check the query string sent for each of two consecutive requests.

        After every advance of the scrape iterator, the last recorded
        request must carry the tag, the since/until numeric filter and the
        expected page index ("0", then "1").
        """
        item = ItemFactory(created_at_i=42)

        httpretty.register_uri(httpretty.GET, AlgoliaEndpoint.URL,
                               responses=self._createPages(pages=2, hits=[item]),
                               content_type="application/json")

        gen = Scraper().scrape(tag="test", since=42, until=43)

        for expected_page in ("0", "1"):
            # The builtin next() works on Python 2.6+ and 3, whereas the
            # .next() method only exists on Python 2 iterators.
            next(gen)
            self.assertDictEqual(httpretty.last_request().querystring,
                {
                  "numericFilters": ["created_at_i>42,created_at_i<43"],
                  "tags": ["test"],
                  "page": [expected_page]
                }
            )
Example #2
0
    def test_translate_missing_field_multiple_objects(self):
        """Expect KeyError when only one of two hits has the mapped field."""
        source_key = "missing_field"
        complete_hit = {"valid_field": 42, source_key: 21}
        incomplete_hit = {"valid_field": 42}
        response = {"hits": [complete_hit, incomplete_hit]}
        mapping = {"expected_field": source_key}

        with self.assertRaises(KeyError):
            Scraper._translateFields(response, fields=mapping)
    def test_translate_missing_field(self):
        """Expect KeyError when the mapping names a field the hit lacks."""
        hit = {"first_field": 42, "second_field": 21}
        mapping = {"expected_field": "missing_field"}
        payload = {"hits": [hit]}

        with self.assertRaises(KeyError):
            Scraper._translateFields(payload, fields=mapping)
    def test_scrape_generator(self):
        """Check that stepping through scrape() yields the hits in order."""
        hits = [ItemFactory(created_at_i=42) for _ in range(2)]

        httpretty.register_uri(httpretty.GET, AlgoliaEndpoint.URL,
                               responses=self._createPages(pages=2, hits=hits),
                               content_type="application/json")

        gen = Scraper().scrape(tag="test", since=42)
        for expected in hits:
            # Builtin next() is portable; gen.next() exists only on Python 2.
            resp = next(gen)
            self.assertEqual(resp, expected)
Example #5
0
    def test_scrape_generator(self):
        """Advance scrape() step by step and compare each item to the hits.

        Two hits are registered via ``self._createPages(pages=2, ...)`` and
        the iterator is expected to return them in the same order.
        """
        hits = [ItemFactory(created_at_i=42) for _ in range(2)]

        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               responses=self._createPages(pages=2, hits=hits),
                               content_type="application/json")

        gen = Scraper().scrape(tag="test", since=42)
        # Use the builtin next() rather than the Python-2-only gen.next().
        resp = next(gen)
        self.assertEqual(resp, hits[0])
        resp = next(gen)
        self.assertEqual(resp, hits[1])
Example #6
0
 def test_translate_fields(self):
     """Check that a single hit is re-keyed according to the mapping."""
     hit = {"first_field": 42, "second_field": 21}
     renames = {"changed1": "first_field", "changed2": "second_field"}
     wanted = {"changed1": 42, "changed2": 21}

     results = Scraper._translateFields({"hits": [hit]}, fields=renames)
     first_result = results[0]

     self.assertDictEqual(first_result, wanted)
    def test_translate_missing_field_multiple_objects(self):
        """Expect KeyError when the second of two hits lacks the field."""
        with_field = {"valid_field": 42, "missing_field": 21}
        without_field = {"valid_field": 42}
        mapping = {"expected_field": "missing_field"}
        payload = {"hits": [with_field, without_field]}

        with self.assertRaises(KeyError):
            Scraper._translateFields(payload, fields=mapping)
 def test_translate_fields_no_fields(self):
     """With ``fields=None`` the first result must equal the input hit."""
     hit = {"first_field": 42, "second_field": 21}
     payload = {"hits": [hit]}

     results = Scraper._translateFields(payload, fields=None)

     self.assertDictEqual(results[0], hit)
Example #9
0
    def test_translate_fields_multiple_objects(self):
        """Check that every hit in the payload is re-keyed by the mapping."""
        count = 2
        hit = {"first_field": 42, "second_field": 21}
        renames = {"changed1": "first_field", "changed2": "second_field"}
        translated_hit = {"changed1": 42, "changed2": 21}

        payload = {"hits": [hit] * count}
        results = Scraper._translateFields(payload, fields=renames)

        self.assertItemsEqual(results, [translated_hit] * count)
Example #10
0
    def test_scrape(self):
        """Exhausting scrape() must return exactly the registered hits."""
        items = [ItemFactory(created_at_i=42) for _ in range(2)]
        pages = self._createPages(hits=items)
        httpretty.register_uri(httpretty.GET, AlgoliaEndpoint.URL,
                               responses=pages,
                               content_type="application/json")

        scraped = list(Scraper().scrape(tag="test", since=42))

        self.assertListEqual(items, scraped)
    def test_get_stories(self):
        """Compare getStories() with hits translated by StoryScraper.FIELDS."""
        stories = [StoryFactory(created_at_i=42) for _ in range(2)]
        pages = self._createPages(hits=stories)
        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               responses=pages,
                               content_type="application/json")

        fetched = list(StoryScraper().getStories(since=42))
        translated = Scraper._translateFields({"hits": stories},
                                              StoryScraper.FIELDS)

        self.assertListEqual(fetched, translated)
Example #12
0
    def test_scrape_all_fields_are_returned(self):
        """Without a field mapping the scraped hit keeps its three keys."""
        hit = ItemFactory(created_at_i=42)
        pages = self._createPages(hits=[hit])
        httpretty.register_uri(httpretty.GET, AlgoliaEndpoint.URL,
                               responses=pages,
                               content_type="application/json")

        scraped = list(Scraper().scrape(tag="test", since=42))
        first_keys = scraped[0].keys()

        self.assertItemsEqual(first_keys,
                              ["objectID", "created_at_i", "title"])
Example #13
0
    def test_scrape_no_items(self):
        """A response reporting zero hits must make scrape() yield nothing."""
        empty_page = ResponseFactory()
        empty_page["nbHits"] = 0
        empty_page["hits"] = []
        body = json.dumps(empty_page)

        httpretty.register_uri(httpretty.GET, AlgoliaEndpoint.URL,
                               body=body,
                               content_type="application/json")

        scraped = list(Scraper().scrape(tag="test", since=42))

        self.assertListEqual(scraped, [])
Example #14
0
    def test_get_stories(self):
        """getStories() must match the hits run through _translateFields."""
        story_hits = [StoryFactory(created_at_i=42) for _ in range(2)]

        httpretty.register_uri(httpretty.GET, AlgoliaEndpoint.URL,
                               responses=self._createPages(hits=story_hits),
                               content_type="application/json")

        actual = list(StoryScraper().getStories(since=42))
        payload = {"hits": story_hits}
        wanted = Scraper._translateFields(payload, StoryScraper.FIELDS)

        self.assertListEqual(actual, wanted)
Example #15
0
    def test_scrape_translate_fields(self):
        """A mapping passed to scrape() must leave only the mapped key."""
        hit = ItemFactory(created_at_i=42)
        pages = self._createPages(hits=[hit])
        httpretty.register_uri(httpretty.GET, AlgoliaEndpoint.URL,
                               responses=pages,
                               content_type="application/json")

        mapping = {"test": "created_at_i"}
        scraped = list(Scraper().scrape(tag="test", since=42,
                                        fields=mapping))

        first_keys = scraped[0].keys()
        self.assertItemsEqual(first_keys, ["test"])
Example #16
0
    def test_scrape_correct_request(self):
        """Verify the query string of the request made at each scrape step.

        The last recorded request must contain the tag, the combined
        since/until numeric filter, and page "0" after the first step and
        page "1" after the second.
        """
        item = ItemFactory(created_at_i=42)

        httpretty.register_uri(httpretty.GET,
                               AlgoliaEndpoint.URL,
                               responses=self._createPages(pages=2,
                                                           hits=[item]),
                               content_type="application/json")

        gen = Scraper().scrape(tag="test", since=42, until=43)

        for expected_page in ("0", "1"):
            # Builtin next() instead of gen.next(), which is Python 2 only.
            next(gen)
            self.assertDictEqual(
                httpretty.last_request().querystring, {
                    "numericFilters": ["created_at_i>42,created_at_i<43"],
                    "tags": ["test"],
                    "page": [expected_page]
                })
    def test_timeout(self):
        """Check that the timeout given to scrape() reaches the patched
        target at ``self.SOCK_SET_TIMEOUT_PATH``."""
        with patch(self.SOCK_SET_TIMEOUT_PATH) as set_timeout_mock:
            # The contents of the response and arguments of the method call
            # are irrelevant, the focus is setting the socket timeout
            httpretty.register_uri(httpretty.GET, AlgoliaEndpoint.URL,
                                   responses=self._createPages(hits=[]),
                                   content_type="application/json")
            timeout = 10
            # Force results retrieval (method is a generator)
            list(Scraper.scrape("comments", 0, 1, 0, timeout))

            # call_args is None when the mock was never called; check that
            # first so the test fails with a clear message instead of
            # erroring with a TypeError on the subscript below.
            self.assertTrue(set_timeout_mock.called,
                            "Timeout has not been set")
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(set_timeout_mock.call_args[0][0], timeout,
                             "Timeout has not been set")
Example #18
0
    def test_timeout(self):
        """Verify the first positional argument of the patched timeout
        setter equals the timeout passed to scrape()."""
        with patch(self.SOCK_SET_TIMEOUT_PATH) as set_timeout_mock:
            # The contents of the response and arguments of the method call
            # are irrelevant, the focus is setting the socket timeout
            httpretty.register_uri(httpretty.GET,
                                   AlgoliaEndpoint.URL,
                                   responses=self._createPages(hits=[]),
                                   content_type="application/json")
            timeout = 10
            # Force results retrieval (method is a generator)
            list(Scraper.scrape("comments", 0, 1, 0, timeout))

            # Guard against call_args being None (mock never called), which
            # would otherwise surface as a TypeError rather than a failure.
            self.assertTrue(set_timeout_mock.called,
                            "Timeout has not been set")
            # Use assertEqual; assertEquals is a deprecated alias.
            self.assertEqual(set_timeout_mock.call_args[0][0], timeout,
                             "Timeout has not been set")
 def test_translate_fields(self):
     """The first translated hit must carry the renamed keys and values."""
     hit = {"first_field": 42, "second_field": 21}
     mapping = {"changed1": "first_field", "changed2": "second_field"}
     wanted = {"changed1": 42, "changed2": 21}
     payload = {"hits": [hit]}

     first_result = Scraper._translateFields(payload, fields=mapping)[0]

     self.assertDictEqual(first_result, wanted)
    def test_translate_fields_multiple_objects(self):
        """Two identical hits must both come back with the renamed keys."""
        how_many = 2
        hit = {"first_field": 42, "second_field": 21}
        mapping = {"changed1": "first_field", "changed2": "second_field"}
        wanted = [{"changed1": 42, "changed2": 21}] * how_many
        payload = {"hits": [hit] * how_many}

        results = Scraper._translateFields(payload, fields=mapping)

        self.assertItemsEqual(results, wanted)
Example #21
0
    def test_scrape_page_limit(self):
        """Expect TooManyItemsException when an empty page still reports
        more hits than the earlier page delivered."""
        items = [ItemFactory(created_at_i=42) for _ in range(2)]
        first_page = httpretty.Response(
            body=json.dumps(ResponseFactory(hits=items)))

        # Trick the scraper into thinking it reached the last page while the
        # reported total says more items are available.
        final_payload = ResponseFactory()
        final_payload["nbHits"] = 3
        final_payload["hits"] = []  # this needs to be empty
        final_page = httpretty.Response(body=json.dumps(final_payload))

        httpretty.register_uri(httpretty.GET, AlgoliaEndpoint.URL,
                               responses=[first_page, final_page],
                               content_type="application/json")

        with self.assertRaises(TooManyItemsException):
            list(Scraper().scrape(tag="test", since=42))
Example #22
0
 def test_translate_fields_no_fields(self):
     """Passing ``fields=None`` must return the hit with its keys intact."""
     hit = {"first_field": 42, "second_field": 21}
     payload = {"hits": [hit]}

     results = Scraper._translateFields(payload, fields=None)
     first_result = results[0]

     self.assertDictEqual(first_result, hit)
Example #23
0
    def test_translate_missing_field(self):
        """Mapping from a key absent in the hit must raise KeyError."""
        absent_key = "missing_field"
        hit = {"first_field": 42, "second_field": 21}
        payload = {"hits": [hit]}
        mapping = {"expected_field": absent_key}

        with self.assertRaises(KeyError):
            Scraper._translateFields(payload, fields=mapping)