def test_download_all(self):
    """Downloading the same URLs twice must hit the HTTP client only once.

    The second pass should be served entirely from the downloader's cache,
    so the stub client's call log must not grow.
    """
    stub_client = HttpClientStub()
    downloader = Downloader(http_client_factory=lambda: stub_client)
    downloader.prepare(ArgsStub())

    first_consumer = ConsumerStub()
    second_consumer = ConsumerStub()
    all_urls = list(stub_client.get_data().keys())

    # First pass: everything comes from the network stub.
    asyncio.run(downloader.download_all(all_urls, first_consumer))
    # Second pass: identical URLs, expected to be answered from cache.
    asyncio.run(downloader.download_all(all_urls, second_consumer))

    # Every page reached the consumer, and each URL was fetched exactly once.
    self.assertEqual(first_consumer.get_data(), stub_client.get_data())
    self.assertEqual(stub_client.get_calls(), all_urls)

    # The cached pass produced the same data without new HTTP calls.
    self.assertEqual(second_consumer.get_data(), stub_client.get_data())
    self.assertEqual(stub_client.get_calls(), all_urls)
def test_qps(self):
    """At qps=1 the downloader should spend roughly one second per URL.

    With N URLs and a 1-request-per-second throttle, the expected wall time
    is N-1 seconds (the first request fires immediately); we allow a
    half-second tolerance either way.
    """
    stub_client = HttpClientStub()
    downloader = Downloader(http_client_factory=lambda: stub_client)

    throttled_args = ArgsStub()
    throttled_args.qps = 1
    downloader.prepare(throttled_args)

    consumer = ConsumerStub()
    urls = list(stub_client.get_data().keys())

    started_at = datetime.now()
    asyncio.run(downloader.download_all(urls, consumer))
    finished_at = datetime.now()

    elapsed = finished_at - started_at
    expected = timedelta(seconds=len(urls) - 1)
    tolerance = timedelta(microseconds=500000)

    # Elapsed time must fall within expected ± tolerance.
    self.assertLess(elapsed, expected + tolerance)
    self.assertGreater(elapsed, expected - tolerance)
def update_all(self):
    """Download every dataset and import its lines into storage.

    Streams each dataset produced by ``Downloader.download_all()`` through
    ``Importer.from_lines`` and reports the total number of inserted rows.
    """
    # sum() replaces the manual accumulator loop; each from_lines call
    # returns the count of rows it inserted.
    total_count = sum(
        Importer.from_lines(dataset) for dataset in Downloader.download_all()
    )
    # f-string fixes the double space the original produced:
    # print("Inserted ", n) emitted "Inserted  <n>" because print's
    # separator added a second space after the one in the literal.
    print(f"Inserted {total_count}")
def test_download_all(mock_get_sources, mock_download):
    """Consuming the download generator must trigger one download per source."""
    # download_all is lazy; materializing it forces every download to run.
    list(Downloader.download_all())
    mock_download.assert_has_calls([call("source1"), call("source2")])