Example #1
    def test_crawl(self, mock_get_page):
        # NB: this is far more of an integration test than a unit test.
        # Breaking it down into more unit-like elements might be a good idea,
        # partly because it takes a long time to run compared to the other
        # tests, since we crawl the entirety of a documentation dump from the
        # internet.
        file_getter = FileGetter('aiohttp.readthedocs.org',
                                 'aiohttp.readthedocs.org')
        # Every fetch is served from the local dump via the mocked get_page,
        # so crawl() never touches the network and its client argument can
        # safely be None.
        mock_get_page.side_effect = asyncio.coroutine(
            lambda client, url: file_getter.get(url))
        pages = self.loop.run_until_complete(
            crawl(None,
                  'http://aiohttp.readthedocs.org/en/stable/index.html',
                  loop=self.loop))

        # Rudimentary check to see that we've fetched all the pages referenced
        # by other pages:
        all_referenced_urls = set()
        for url, page in pages.items():
            if isinstance(page, Page):
                all_referenced_urls |= page.internal_urls - page.resource_urls
        self.assertEqual(set(pages), all_referenced_urls)

        # Pages that failed to fetch are recorded in the result mapping as
        # exceptions rather than Page objects:
        self.assertIsInstance(
            pages['http://aiohttp.readthedocs.org/en/stable/_modules/aiohttp/'
                  '_multidict.html'], Exception)
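
The test stubs out get_page so the whole crawl is served from a local dump of
aiohttp.readthedocs.org rather than the live site. The FileGetter helper it
leans on is not shown on this page; here is a minimal sketch of what it
plausibly does. Only the class name and the two constructor arguments come
from the test above; everything else is an assumption:

import os
from urllib.parse import urlsplit


class FileGetter:
    # Hypothetical sketch: serve pages for `host` from files under
    # `dump_dir` instead of fetching them over the network.
    def __init__(self, host, dump_dir):
        self.host = host
        self.dump_dir = dump_dir

    def get(self, url):
        parts = urlsplit(url)
        if parts.netloc != self.host:
            raise ValueError('unexpected host: %s' % parts.netloc)
        path = os.path.join(self.dump_dir, parts.path.lstrip('/'))
        # A missing file raises OSError, which the crawler records in its
        # results mapping; the final assertion above relies on this.
        with open(path, encoding='utf-8') as f:
            return f.read()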
Example #2
def main(url, max_concurrent_requests, resources):
    loop = asyncio.get_event_loop()
    client = LimitedClientSession(max_concurrent_requests, loop=loop)
    try:
        crawled_pages = loop.run_until_complete(crawl(client, url, loop=loop))
    finally:
        # Release the HTTP session and the event loop even if the crawl
        # raises.
        client.close()
        loop.close()
    return make_graph(url, crawled_pages, resources)
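
LimitedClientSession is not shown on this page either; judging from the call
in main(), it is an aiohttp client session wrapped with a concurrency cap. A
minimal sketch under that assumption, bounding in-flight requests with a
semaphore; only the class name and constructor signature come from the code
above:

import asyncio

import aiohttp


class LimitedClientSession:
    # Hypothetical sketch: allow at most max_concurrent_requests fetches
    # to be in flight at once.
    def __init__(self, max_concurrent_requests, *, loop=None):
        # `loop` is accepted to match the call in main(); recent asyncio
        # and aiohttp discover the running loop themselves, so it is unused.
        self._session = aiohttp.ClientSession()
        self._semaphore = asyncio.Semaphore(max_concurrent_requests)

    async def get(self, url, **kwargs):
        # The semaphore slot is held until the response headers arrive,
        # which is enough to cap concurrent connection attempts.
        async with self._semaphore:
            return await self._session.get(url, **kwargs)

    def close(self):
        # Synchronous close to match the call in main(); on aiohttp 3.x
        # ClientSession.close() is a coroutine and would need awaiting.
        self._session.close()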