Code example #1
0
    def scraper_result_test(self, results, response_times):
        """Run scrape_urls() against mocked aiohttp responses and verify output.

        Args:
            results: expected output table with 'url', 'status' and 'html'
                columns; also used (via make_test_tasks) to build the mock
                responses the scraper will receive.
            response_times: per-row simulated response delays, passed to
                make_test_tasks().
        """
        with mock.patch('aiohttp.ClientSession') as session:
            urls = results['url']
            results_tasks = make_test_tasks(results, response_times)
            # get the mock obj returned by aiohttp.ClientSession()
            session_mock = session.return_value
            session_mock.get.side_effect = results_tasks

            # mock the output table format scraper expects
            out_table = pd.DataFrame(data={
                'url': urls,
                'status': ''
            },
                                     columns=['url', 'status', 'html'])

            # asyncio.run() creates and tears down a fresh event loop;
            # asyncio.get_event_loop() is deprecated when no loop is running.
            asyncio.run(scrape_urls(urls, out_table))

            # ensure aiohttp.get() called with the right sequence of urls
            valid_urls = [x for x in urls if is_valid_url(x)]
            call_urls = [
                args[0] for name, args, kwargs in session_mock.get.mock_calls
            ]
            self.assertEqual(call_urls, valid_urls)

            # ensure we saved the right results
            self.assertTrue(out_table['status'].equals(results['status']))
            self.assertTrue(out_table['html'].equals(results['html']))
Code example #2
0
    def scraper_result_test(self, results, response_times):
        """Drive urlscraper.scrape_urls() against a mocked aiohttp session.

        Args:
            results: expected output table with 'url', 'status' and 'html'
                columns; the mock GET looks its responses up here.
            response_times: per-row simulated delay (seconds) applied before
                each mock response resolves.
        """
        async def session_get(url, *, timeout=None):
            url = str(url)  # undo yarl un-magick-ing

            # Silly mock HTTP GET computes the test's input based on its
            # expected output. This defeats the purpose of a test.
            row = results[results["url"] == url]
            if row.empty:
                raise ValueError("called with URL we did not expect")
            index = row.index[0]
            delay = response_times[index]
            await asyncio.sleep(delay)

            status = row.at[index, "status"]
            text = row.at[index, "html"]

            # Map the expected status strings back onto the exceptions the
            # scraper is supposed to translate them from.
            if status == "Timed out":
                raise asyncio.TimeoutError
            elif status == "Invalid URL":
                raise aiohttp.InvalidURL(url)
            elif status == "Can't connect: blah":
                raise aiohttp.client_exceptions.ClientConnectionError("blah")
            else:
                return MockResponse(int(status), text)

        with patch("aiohttp.ClientSession") as session:
            urls = results["url"].tolist()
            session_mock = session.return_value
            session_mock.get.side_effect = session_get

            # mock the output table format scraper expects
            out_table = pd.DataFrame(data={
                "url": urls,
                "status": ""
            },
                                     columns=["url", "status", "html"])

            # asyncio.run() creates and tears down a fresh event loop;
            # asyncio.get_event_loop() is deprecated when no loop is running.
            asyncio.run(urlscraper.scrape_urls(urls, out_table))

            assert_frame_equal(out_table[["url", "status", "html"]],
                               results[["url", "status", "html"]])

            # ensure aiohttp.get() called with the right *set* of urls
            # (scraping is concurrent, so call order is not asserted).
            # str() to un-magick the yarl.URL() magic
            call_urls = [
                str(args[0])
                for name, args, kwargs in session_mock.get.mock_calls
            ]
            self.assertEqual(set(call_urls), set(urls))
Code example #3
0
    def scraper_result_test(self, results, response_times):
        """Drive scrape_urls() against a mocked aiohttp session.

        Args:
            results: expected output table with 'url', 'status' and 'html'
                columns; the mock GET looks its responses up here.
            response_times: per-row simulated delay (seconds) applied before
                each mock response resolves.
        """
        async def session_get(url, *, timeout=None):
            # aiohttp may hand us a yarl.URL; normalize so the DataFrame
            # lookup below matches the plain-string 'url' column.
            url = str(url)

            # Silly mock HTTP GET computes the test's input based on its
            # expected output. This defeats the purpose of a test.
            row = results[results['url'] == url]
            if row.empty:
                raise ValueError('called with URL we did not expect')
            index = row.index[0]
            delay = response_times[index]
            await asyncio.sleep(delay)

            status = row.at[index, 'status']
            text = row.at[index, 'html']

            # Map the expected status strings back onto the exceptions the
            # scraper is supposed to translate them from.
            if status == 'Timed out':
                raise asyncio.TimeoutError
            elif status == 'Invalid URL':
                raise aiohttp.InvalidURL(url)
            elif status == "Can't connect: blah":
                raise aiohttp.client_exceptions.ClientConnectionError('blah')
            else:
                return MockResponse(int(status), text)

        with patch('aiohttp.ClientSession') as session:
            urls = results['url'].tolist()
            session_mock = session.return_value
            session_mock.get.side_effect = session_get

            # mock the output table format scraper expects
            out_table = pd.DataFrame(data={
                'url': urls,
                'status': ''
            },
                                     columns=['url', 'status', 'html'])

            # asyncio.run() creates and tears down a fresh event loop;
            # asyncio.get_event_loop() is deprecated when no loop is running.
            asyncio.run(scrape_urls(urls, out_table))

            assert_frame_equal(out_table[['url', 'status', 'html']],
                               results[['url', 'status', 'html']])

            # ensure aiohttp.get() called with the right *set* of urls
            # (scraping is concurrent, so call order is not asserted).
            # str() to un-magick any yarl.URL the scraper passed.
            call_urls = [
                str(args[0])
                for name, args, kwargs in session_mock.get.mock_calls
            ]
            self.assertEqual(set(call_urls), set(urls))