Example #1
    def test_crawl(self, mock_get_page):
        # NB: this is far more of an integration test than a unit test.
        # Breaking it down into more unit-like elements might be a good idea,
        # partly because it takes a long time to run compared to the other
        # tests, since we crawl the entirety of a documentation dump from the
        # internet.
        file_getter = FileGetter('aiohttp.readthedocs.org',
                                 'aiohttp.readthedocs.org')
        # Every fetch is served from the local dump via the mocked get_page,
        # so crawl() never touches the network and its client argument can
        # safely be None.
        mock_get_page.side_effect = asyncio.coroutine(
            lambda client, url: file_getter.get(url))
        pages = self.loop.run_until_complete(
            crawl(None,
                  'http://aiohttp.readthedocs.org/en/stable/index.html',
                  loop=self.loop))

        # Rudimentary check to see that we've fetched all the pages referenced
        # by other pages:
        all_referenced_urls = set()
        for url, page in pages.items():
            if isinstance(page, Page):
                all_referenced_urls |= page.internal_urls - page.resource_urls
        self.assertEqual(set(pages), all_referenced_urls)

        # Pages that failed to fetch are recorded in the result mapping as
        # exceptions rather than Page objects:
        self.assertIsInstance(
            pages['http://aiohttp.readthedocs.org/en/stable/_modules/aiohttp/'
                  '_multidict.html'], Exception)
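
The test stubs out get_page so the whole crawl is served from a local dump of
aiohttp.readthedocs.org rather than the live site. The FileGetter helper it
leans on is not shown on this page; here is a minimal sketch of what it
plausibly does. Only the class name and the two constructor arguments come
from the test above; everything else is an assumption:

import os
from urllib.parse import urlsplit


class FileGetter:
    # Hypothetical sketch: serve pages for `host` from files under
    # `dump_dir` instead of fetching them over the network.
    def __init__(self, host, dump_dir):
        self.host = host
        self.dump_dir = dump_dir

    def get(self, url):
        parts = urlsplit(url)
        if parts.netloc != self.host:
            raise ValueError('unexpected host: %s' % parts.netloc)
        path = os.path.join(self.dump_dir, parts.path.lstrip('/'))
        # A missing file raises OSError, which the crawler records in its
        # results mapping; the final assertion above relies on this.
        with open(path, encoding='utf-8') as f:
            return f.read()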
Example #2
def main(url, max_concurrent_requests, resources):
    loop = asyncio.get_event_loop()
    client = LimitedClientSession(max_concurrent_requests, loop=loop)
    try:
        crawled_pages = loop.run_until_complete(crawl(client, url, loop=loop))
    finally:
        # Release the HTTP session and the event loop even if the crawl
        # raises.
        client.close()
        loop.close()
    return make_graph(url, crawled_pages, resources)
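
LimitedClientSession is not shown on this page either; judging from the call
in main(), it is an aiohttp client session wrapped with a concurrency cap. A
minimal sketch under that assumption, bounding in-flight requests with a
semaphore; only the class name and constructor signature come from the code
above:

import asyncio

import aiohttp


class LimitedClientSession:
    # Hypothetical sketch: allow at most max_concurrent_requests fetches
    # to be in flight at once.
    def __init__(self, max_concurrent_requests, *, loop=None):
        # `loop` is accepted to match the call in main(); recent asyncio
        # and aiohttp discover the running loop themselves, so it is unused.
        self._session = aiohttp.ClientSession()
        self._semaphore = asyncio.Semaphore(max_concurrent_requests)

    async def get(self, url, **kwargs):
        # The semaphore slot is held until the response headers arrive,
        # which is enough to cap concurrent connection attempts.
        async with self._semaphore:
            return await self._session.get(url, **kwargs)

    def close(self):
        # Synchronous close to match the call in main(); on aiohttp 3.x
        # ClientSession.close() is a coroutine and would need awaiting.
        self._session.close()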