Beispiel #1
0
    def test_crawl_001(self):
        """Crawling must call `api.get_crawl_queue()`."""

        queue_api = 'api.get_crawl_queue'
        smock.mock(queue_api, returns=[])

        # `False` as the second argument asks eventlet for a silent timeout:
        # the `with` block is left without an exception being raised.
        with eventlet.Timeout(DEFAULT_TIMEOUT, False):
            self.client.crawl()

        queue_was_requested = smock.is_called(queue_api)
        self.assertTrue(queue_was_requested)
Beispiel #2
0
    def test_get_001(self):
        """`crawl_queue` must call `StorageConnection.query_new_random`."""

        storage_query = 'StorageConnection.query_new_random'

        # POST request asking the manager for up to ten queue items.
        request = webob.Request.blank('/')
        request.method = 'POST'
        request.body = "limit=10"

        # Storage has nothing to offer; only the fact of the call matters.
        smock.mock(storage_query, returns=[])
        self.manager.active = True
        self.manager.crawl_queue(request)

        storage_was_queried = smock.is_called(storage_query)
        self.assertTrue(storage_was_queried)
Beispiel #3
0
    def test_get_003(self):
        """Must return no more than requested items.

        The storage mock hands out three items, one per call, while the
        request only asks for `limit=2`.
        """

        req = webob.Request.blank('/')
        req.method = 'POST'
        req.body = "limit=2"

        items = [{'url': "http://url1/", 'visited': None},
                 {'url': "http://url2/", 'visited': None},
                 {'url': "http://url3/", 'visited': None}]
        # The mock consumes a copy, so `items` itself stays intact.
        items_copy = items[:]
        smock.mock('StorageConnection.query_new_random',
                   returns_func=lambda *a, **kw: [items_copy.pop()] if items_copy else [])
        # Activate the manager exactly as the other `crawl_queue` tests
        # (`test_get_001`, `test_get_002`) do, so that this test exercises the
        # same code path instead of whatever an inactive manager does.
        # NOTE(review): `Manager` is not visible here -- confirm that `active`
        # is required for `crawl_queue` to consult storage.
        self.manager.active = True
        result = self.manager.crawl_queue(req)
        self.assertTrue(len(result) <= 2,
                        u"Requested at most 2 items, got %d." % (len(result),))
Beispiel #4
0
 def setUp(self):
     """Configure `settings`, build a `Manager` and mock storage construction."""
     # Prefetch queue settings.
     settings.prefetch = dict(queue_size=10,
                              get_timeout=0.01,
                              single_limit=5,
                              cache_timeout=60)
     # Post-report queue settings.
     settings.postreport = dict(queue_size=10,
                                flush_size=1,
                                flush_delay=0.01)
     settings.storage = dict(max_connections=1)
     settings.api = dict(max_queue_limit=100)

     self.manager = Manager()

     # Replace the `StorageConnection` constructor with a no-op mock.
     # NOTE(review): this mock is installed after `Manager()` has already been
     # constructed -- confirm `Manager.__init__` does not create a
     # `StorageConnection` itself.
     smock.mock('StorageConnection.__init__', returns=None)
Beispiel #5
0
    def test_crawl_004(self):
        """Must handle invalid URL with empty port number.

        The crawl queue serves a single item whose URL has a colon but no port
        (`http://localhost:/...`). The crawler is expected to report that item
        back with an `'Error'` marker in its `result` field.
        """

        bad_url = "http://localhost:/test_crawl_004_link"
        # Reports are only recorded inside the mock and verified after the
        # crawl. Asserting inside the callback alone cannot detect the case
        # where the crawler never reports anything at all.
        reports = []

        def mock_get_crawl_queue(_limit):
            # A fresh item per call, so nothing is shared between calls.
            return [{'url': bad_url, 'visited': None, 'links': []}]

        def mock_report_result(report):
            reports.append(report)

        smock.mock('api.get_crawl_queue', returns_func=mock_get_crawl_queue)
        smock.mock('api.report_result', returns_func=mock_report_result)
        # Silent timeout (`False`): leaving the block on timeout raises nothing.
        with eventlet.Timeout(DEFAULT_TIMEOUT, False):
            self.client.crawl()

        self.assertTrue(reports, u"Crawler didn't report any result for the invalid URL.")
        for report in reports:
            self.assertEqual(report['url'], bad_url)
            self.assertTrue('Error' in report['result'])
Beispiel #6
0
    def test_get_002(self):
        """`crawl_queue` must return the list of items fetched from storage."""

        request = webob.Request.blank('/')
        request.method = 'POST'
        request.body = "limit=10"

        items = [{'url': "http://url1/", 'visited': None},
                 {'url': "http://url2/", 'visited': None},
                 {'url': "http://url3/", 'visited': None}]
        # The mock drains this copy; `items` is kept for the final comparison.
        remaining = list(items)

        def hand_out_one(*_args, **_kwargs):
            """Return the last remaining item in a list, or `[]` when drained."""
            if not remaining:
                return []
            return [remaining.pop()]

        smock.mock('StorageConnection.query_new_random',
                   returns_func=hand_out_one)
        self.manager.active = True
        result = self.manager.crawl_queue(request)

        # Order is irrelevant, so both sides are sorted before comparing.
        expected = sorted(items)
        actual = sorted(result)
        self.assertEqual(expected, actual)
Beispiel #7
0
    def test_crawl_003(self):
        """Must not exceed `max_connections_per_host` simultaneous connections to one server.

        The queue is pre-filled with twice that many identical items. Each
        mocked HTTP request records the active connection count and then
        sleeps, so that requests overlap in time.
        """

        item = {'url': "http://localhost/test_crawl_003_link", 'visited': None, 'links': []}
        # A dict, so the nested mock can update the value without `nonlocal`.
        peak = {'max_count': 0}
        total_items = self.client.max_connections_per_host * 2
        pause = 0.05

        def mock_httplib2_request_sleep(url, *args, **kwargs): # pylint: disable-msg=W0613
            # NOTE(review): the counter is queried for '127.0.0.1' while the
            # item URL uses the host name `localhost` -- presumably the crawler
            # keys connections by resolved address; confirm.
            active_now = self.client.get_active_connections_count('127.0.0.1')
            if active_now > peak['max_count']:
                peak['max_count'] = active_now
            eventlet.sleep(pause)
            return make_http_response(404), ""

        smock.mock('api.get_crawl_queue', returns=[])
        smock.mock('api.report_result', returns=None)
        smock.mock('httplib2.Http.request', returns_func=mock_httplib2_request_sleep)

        # The API mock serves nothing, so the work is put on the queue directly.
        for _ in xrange(total_items):
            self.client.queue.put(item)

        # Silent timeout (`False`): leaving the block on timeout raises nothing.
        with eventlet.Timeout(DEFAULT_TIMEOUT, False):
            self.client.crawl()

        queue_drained = self.client.queue.empty()
        self.assertTrue(queue_drained, u"Crawler didn't consume all queue in allocated time.")
        stopped = self.client.graceful_stop(timeout=total_items * pause)
        self.assertTrue(stopped, u"Crawler didn't stop in allocated time.")
        self.assertTrue(peak['max_count'] > 0, u"No connections started at all.")
        within_limit = peak['max_count'] <= self.client.max_connections_per_host
        self.assertTrue(within_limit, u"Started too many connections.")
Beispiel #8
0
    def test_put_001(self):
        """`report_result` must accept one valid report item.

        There are no explicit assertions: the test passes when
        `report_result` completes without raising.
        """

        url = "http://localhost/manager-test_put_001-url"
        visited_at = datetime.now().strftime(TIME_FORMAT)
        item = {'url': url, 'visited': visited_at,
                'status_code': 200, 'content': "test content",
               }

        # PUT request carrying the report as a JSON body.
        request = webob.Request.blank('/')
        request.method = 'PUT'
        request.body = json.dumps(item)

        # Storage layer is fully mocked; the lookup yields a not-yet-visited
        # record for the same URL.
        smock.mock('StorageConnection.save_content', returns=None)
        smock.mock('StorageConnection.query_all_by_url_one', returns={'url': url, 'visited': None})
        smock.mock('StorageConnection.save', returns=None)
        smock.mock('StorageConnection.update', returns=None)

        self.manager.report_result(request)
Beispiel #9
0
    def test_crawl_002(self):
        """Crawling must call `httplib2.Http.request` and `api.report_result`."""

        link = "http://localhost/test_crawl_002_link"

        def serve_single_item(_limit):
            # A new list and dict on every call, so nothing is shared.
            return [{'url': link, 'visited': None, 'links': []}]

        def check_reported_url(report):
            self.assertEqual(report['url'], link)

        smock.mock('api.get_crawl_queue', returns_func=serve_single_item)
        smock.mock('api.report_result', returns_func=check_reported_url)
        smock.mock('httplib2.Http.request', returns=(make_http_response(404), ""))

        # Silent timeout (`False`): leaving the block on timeout raises nothing.
        with eventlet.Timeout(DEFAULT_TIMEOUT, False):
            self.client.crawl()

        for mocked_name in ('httplib2.Http.request', 'api.report_result'):
            self.assertTrue(smock.is_called(mocked_name))
Beispiel #10
0
    def setUp(self):
        """Configure `settings`, create the `Crawler` and install the default mocks.

        The HTTP mock answers from `self.responses` (URL -> `(code, content)`),
        then from `self.handlers` (URL -> callable), and otherwise follows the
        policy named by `self.on_unexpected_uri`: 'fail', '200', '404' or 'call'.
        """
        settings.manager_url = "fake-url"
        settings.socket_timeout = 10
        settings.identity = {'name': "HeroshiBot", 'user_agent': "HeroshiBot/100.500 (lalala)"}
        self.client = Crawler(queue_size=2000, max_connections=20)

        # Per-test bookkeeping consulted or filled by the HTTP mock below.
        self.uris = []
        self.responses = {}
        self.handlers = {}
        self.requested = []
        self.used_run_crawler = False
        self.on_unexpected_uri = 'fail'

        def unset_unexpected_uri_func(url):
            return self.fail(u"`self.on_unexpected_uri_func` is unset.")

        def respond_200(url):
            return (make_http_response(200), "Dummy page at %s." % (url,))

        def respond_404(url):
            return (make_http_response(404), "Not found: %s." % (url,))

        self.on_unexpected_uri_func = unset_unexpected_uri_func
        # The attribute names (including their spelling) are kept as they are,
        # since other code may refer to them.
        self.default_hanlder_200 = respond_200
        self.default_hanlder_404 = respond_404

        def mock_httplib2_request(url, *args, **kwargs):
            # Every requested URL is recorded, whether or not it is known.
            self.requested.append(url)

            if url in self.responses:
                code, content = self.responses[url]
                return make_http_response(code), content

            if url in self.handlers:
                return self.handlers[url](url)

            # Unregistered URL: behave as `self.on_unexpected_uri` dictates.
            if self.on_unexpected_uri == 'fail':
                self.fail(u"Unknown URL requested: %s. You didn't register it in `self.uris`." % (url,))
            elif self.on_unexpected_uri == '200':
                return self.default_hanlder_200(url)
            elif self.on_unexpected_uri == '404':
                return self.default_hanlder_404(url)
            elif self.on_unexpected_uri == 'call':
                # NOTE(review): the other branches return a (response, content)
                # pair, while this one returns only what `make_http_response`
                # produces from the callback's unpacked result -- confirm
                # that this is intended.
                return make_http_response(*self.on_unexpected_uri_func(url))
            else:
                self.fail(u"Unknown URL requested: %s. And no code for `self.on_unexpected_uri`: %s." %
                          (url, self.on_unexpected_uri))
            # Safety net: every path above is expected to return or fail.
            self.fail(u"httplib2_request mock supposed to return somewhere earlier.")

        smock.mock('api.get_crawl_queue', returns=[])
        smock.mock('api.report_result', returns=None)
        smock.mock('httplib2.Http.request', returns_func=mock_httplib2_request)