def scraper_result_test(self, results, response_times):
    """Run scrape_urls() against mocked HTTP responses and check the output.

    `results` is a table holding the input urls plus the status/html values
    we expect the scraper to record; `response_times` supplies the simulated
    delay for each canned response.
    """
    with mock.patch('aiohttp.ClientSession') as session:
        urls = results['url']

        # Each fake task resolves to the canned result for its url.
        fake_tasks = make_test_tasks(results, response_times)

        # aiohttp.ClientSession() returns this mock; its .get() hands out
        # the canned tasks one by one.
        client = session.return_value
        client.get.side_effect = fake_tasks

        # Empty table shaped the way the scraper expects to fill it in.
        out_table = pd.DataFrame(
            data={'url': urls, 'status': ''},
            columns=['url', 'status', 'html'])

        loop = asyncio.get_event_loop()
        loop.run_until_complete(scrape_urls(urls, out_table))

        # The scraper must have fetched exactly the valid urls, in order.
        expected_urls = [u for u in urls if is_valid_url(u)]
        fetched_urls = [args[0] for _, args, _ in client.get.mock_calls]
        self.assertEqual(fetched_urls, expected_urls)

        # The scraper must have written back the expected columns.
        self.assertTrue(out_table['status'].equals(results['status']))
        self.assertTrue(out_table['html'].equals(results['html']))
def scraper_result_test(self, results, response_times):
    """Drive urlscraper.scrape_urls() with a mocked aiohttp session.

    `results` holds the input urls plus the status/html values we expect
    written back; `response_times` gives each fake response's simulated
    delay.
    """
    async def fake_get(url, *, timeout=None):
        # aiohttp hands us a yarl.URL; turn it back into a plain string
        # so it can be matched against the DataFrame's url column.
        url = str(url)
        # NOTE: this mock derives the test's input from its expected
        # output, which rather defeats the purpose of a test.
        match = results[results["url"] == url]
        if match.empty:
            raise ValueError("called with URL we did not expect")
        idx = match.index[0]
        await asyncio.sleep(response_times[idx])
        status = match.at[idx, "status"]
        html = match.at[idx, "html"]
        if status == "Timed out":
            raise asyncio.TimeoutError
        if status == "Invalid URL":
            raise aiohttp.InvalidURL(url)
        if status == "Can't connect: blah":
            raise aiohttp.client_exceptions.ClientConnectionError("blah")
        return MockResponse(int(status), html)

    with patch("aiohttp.ClientSession") as session:
        urls = results["url"].tolist()
        client = session.return_value
        client.get.side_effect = fake_get

        # Empty table shaped the way the scraper expects to fill it in.
        out_table = pd.DataFrame(
            data={"url": urls, "status": ""},
            columns=["url", "status", "html"])

        loop = asyncio.get_event_loop()
        loop.run_until_complete(
            urlscraper.scrape_urls(urls, out_table))

        assert_frame_equal(out_table[["url", "status", "html"]],
                           results[["url", "status", "html"]])

        # Every url must have been fetched (order-independent); str()
        # undoes the yarl.URL wrapping of the recorded call args.
        fetched = [str(args[0]) for _, args, _ in client.get.mock_calls]
        self.assertEqual(set(fetched), set(urls))
def scraper_result_test(self, results, response_times):
    """Drive scrape_urls() with a mocked aiohttp session and verify output.

    Parameters:
        results: DataFrame of input urls plus the status/html values the
            scraper is expected to record for each.
        response_times: per-row simulated response delay, indexed like
            `results`.
    """
    async def session_get(url, *, timeout=None):
        # FIX: aiohttp wraps urls in yarl.URL; convert back to a plain
        # string so equality against the DataFrame's string column works
        # (otherwise `row` is always empty and the mock raises).
        url = str(url)
        # Silly mock HTTP GET computes the test's input based on its
        # expected output. This defeats the purpose of a test.
        row = results[results['url'] == url]
        if row.empty:
            raise ValueError('called with URL we did not expect')
        index = row.index[0]
        delay = response_times[index]
        await asyncio.sleep(delay)
        status = row.at[index, 'status']
        text = row.at[index, 'html']
        if status == 'Timed out':
            raise asyncio.TimeoutError
        elif status == 'Invalid URL':
            raise aiohttp.InvalidURL(url)
        elif status == "Can't connect: blah":
            raise aiohttp.client_exceptions.ClientConnectionError('blah')
        else:
            return MockResponse(int(status), text)

    with patch('aiohttp.ClientSession') as session:
        urls = results['url'].tolist()
        session_mock = session.return_value
        session_mock.get.side_effect = session_get

        # mock the output table format scraper expects
        out_table = pd.DataFrame(data={
            'url': urls,
            'status': ''
        }, columns=['url', 'status', 'html'])

        event_loop = asyncio.get_event_loop()
        event_loop.run_until_complete(scrape_urls(urls, out_table))

        assert_frame_equal(out_table[['url', 'status', 'html']],
                           results[['url', 'status', 'html']])

        # ensure aiohttp.get() called with the right set of urls;
        # FIX: str() undoes the yarl.URL wrapping of the recorded call
        # args so they compare equal to the plain-string inputs.
        call_urls = [
            str(args[0])
            for name, args, kwargs in session_mock.get.mock_calls
        ]
        self.assertEqual(set(call_urls), set(urls))