Ejemplo n.º 1
0
    def test_can_wait_when_no_urls(self, logging_mock):
        # Calling wait() on a freshly built instance (nothing enqueued) is
        # expected to log the "returning immediately" message.
        otto = TornadoOctopus(cache=False, auto_start=True)

        otto.wait()

        # `assert_calls` is not part of the Mock API: on older mock releases
        # it is an auto-created child mock (the check can never fail) and on
        # newer ones it raises AttributeError. `assert_any_call` performs a
        # real verification.
        # NOTE(review): assumes logging_mock replaces the logging function
        # itself (the patch decorator is not visible here) -- confirm.
        logging_mock.assert_any_call(
            'No urls to wait for. Returning immediately.')
Ejemplo n.º 2
0
    def test_can_stop(self):
        # stop() is expected to call stop() on whatever object is installed
        # as the instance's ioloop; a Mock stands in for the real loop.
        octopus = TornadoOctopus(cache=False, auto_start=True)
        octopus.ioloop = Mock()

        octopus.stop()

        loop_was_stopped = octopus.ioloop.stop.called
        expect(loop_was_stopped).to_be_true()
Ejemplo n.º 3
0
    def test_can_handle_wait_timeout(self):
        # The wait-timeout handler is expected to stop the (mocked) ioloop.
        octopus = TornadoOctopus(cache=False, auto_start=True)
        octopus.ioloop = Mock()

        octopus.handle_wait_timeout(1, None)

        loop_was_stopped = octopus.ioloop.stop.called
        expect(loop_was_stopped).to_be_true()
Ejemplo n.º 4
0
    def test_can_enqueue_url_and_fetch(self, fetch_mock):
        # After enqueue() the queue must be empty and the patched fetch must
        # have received the URL, handler, method and extra keyword argument.
        target_url = 'http://www.google.com'
        octopus = TornadoOctopus(cache=True)

        octopus.enqueue(target_url, None, method='GET', something="else")

        expect(octopus.url_queue).to_be_empty()
        fetch_mock.assert_called_once_with(
            target_url, None, 'GET', something='else')
Ejemplo n.º 5
0
    def test_can_stop(self):
        # A Mock replaces the ioloop so the stop() delegation can be observed.
        octopus = TornadoOctopus(cache=False, auto_start=True)
        octopus.ioloop = Mock()

        octopus.stop()

        stop_called = octopus.ioloop.stop.called
        expect(stop_called).to_be_true()
Ejemplo n.º 6
0
    def test_can_handle_wait_timeout(self):
        # Invoking the timeout handler directly must stop the mocked ioloop.
        octopus = TornadoOctopus(cache=False, auto_start=True)
        octopus.ioloop = Mock()

        octopus.handle_wait_timeout(1, None)

        stop_called = octopus.ioloop.stop.called
        expect(stop_called).to_be_true()
Ejemplo n.º 7
0
def download():
    """Fetch NUM_IMAGES placeholder images for every entry of SIZES.

    Each image is requested from lorempixel.com through a TornadoOctopus
    instance and saved as ``./tests/fixtures/imageset/<size>/image_<n>.jpg``.
    A response whose status code is not 200 is re-enqueued with the same
    size/index so that the same file slot is retried.
    """
    otto = TornadoOctopus(
        concurrency=100, auto_start=True,
    )

    def enqueue(size, index, width, height):
        """Queue the download of image number ``index`` for ``size``."""
        url = 'http://lorempixel.com/%d/%d/' % (width, height)
        # Use the ``index`` argument rather than the outer loop variable:
        # retries run after the loop has finished, when the loop variable
        # is stuck at its last value and would mislabel the retried image.
        print("Enqueuing %s image %d..." % (size, index))
        otto.enqueue(url, handle_url_response(size, index, width, height))

    def handle_url_response(size, index, width, height):
        """Build the response callback bound to one (size, index) slot."""
        def handle(url, response):
            if response.status_code != 200:
                print("%s image %d (%d) failed." % (size, index, response.status_code))
                enqueue(size, index, width, height)
                return

            print("%s image %d (%d) saved." % (size, index, response.status_code))

            path = './tests/fixtures/imageset/%s' % size
            if not exists(path):
                os.makedirs(path)
            jpg = StringIO(response.text)
            img = Image.open(jpg)
            img.save('%s/image_%d.jpg' % (path, index))

        return handle

    # Image numbers are 1-based.
    for index in range(1, NUM_IMAGES + 1):
        for size, width, height in SIZES:
            enqueue(size, index, width, height)

    otto.wait()  # waits until queue is empty or timeout is elapsed
Ejemplo n.º 8
0
    def test_can_wait_when_urls_and_timeout(self):
        # With URLs marked as running and the default timeout, wait() is
        # expected to arm the ioloop's blocking-signal threshold.
        otto = TornadoOctopus(cache=False, auto_start=True)
        otto.ioloop = Mock()
        otto.running_urls = 10

        otto.wait()

        # The original built the expectation but never applied a matcher, so
        # nothing was asserted; `.to_be_true()` makes the check effective.
        expect(otto.ioloop.set_blocking_signal_threshold.called).to_be_true()
Ejemplo n.º 9
0
    def test_can_wait_when_urls_and_no_timeout(self, logging_mock):
        # With URLs marked as running and a timeout of 0, wait() is expected
        # to log that it is waiting for the URLs.
        otto = TornadoOctopus(cache=False, auto_start=True)
        otto.ioloop = Mock()
        otto.running_urls = 10

        otto.wait(0)

        # `assert_calls` is not a Mock assertion (it silently passes on old
        # mock versions and raises AttributeError on new ones); use a real
        # assertion instead.
        # NOTE(review): assumes logging_mock replaces the logging function
        # itself (the patch decorator is not visible here) -- confirm.
        logging_mock.assert_any_call('Waiting for urls to be retrieved.')
Ejemplo n.º 10
0
    def test_can_enqueue_url(self):
        # With concurrency=0 the enqueued request is expected to stay in the
        # queue, so the queue length must be exactly one afterwards.
        octopus = TornadoOctopus(cache=False, concurrency=0)

        octopus.enqueue(
            'http://www.google.com', None, method='GET', something="else")

        pending = octopus.url_queue
        expect(pending).to_length(1)
Ejemplo n.º 11
0
 def start_otto(self):
     # Log the configured concurrency, then build the TornadoOctopus from
     # the worker's options/config and start it.
     concurrency = self.options.concurrency
     self.info(
         'Starting Octopus with %d concurrent threads.' % concurrency)

     octopus = TornadoOctopus(
         concurrency=concurrency,
         cache=self.options.cache,
         connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
         request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
         limiter=self.get_otto_limiter(),
     )
     self.otto = octopus
     self.otto.start()
Ejemplo n.º 12
0
    def test_can_fetch(self):
        # The instance is built with cache=False, so even though an entry for
        # the URL is placed in response_cache, fetch() must still go through
        # the (mocked) HTTP client and count one running URL.
        target_url = 'http://www.google.com'
        octopus = TornadoOctopus(cache=False, auto_start=True)
        octopus.response_cache.put(target_url, Mock())

        fake_client = Mock()
        octopus.http_client = fake_client

        octopus.fetch(target_url, None, 'GET')

        expect(octopus.running_urls).to_equal(1)
        expect(fake_client.fetch.called).to_be_true()
Ejemplo n.º 13
0
    def test_can_fetch(self):
        # cache=False: the cached entry must not short-circuit the request,
        # so the mocked HTTP client is used and running_urls becomes 1.
        target_url = 'http://www.google.com'
        octopus = TornadoOctopus(cache=False, auto_start=True)
        octopus.response_cache.put(target_url, Mock())

        fake_client = Mock()
        octopus.http_client = fake_client

        octopus.fetch(target_url, None, 'GET')

        expect(octopus.running_urls).to_equal(1)
        expect(fake_client.fetch.called).to_be_true()
Ejemplo n.º 14
0
    def test_can_enqueue_and_get_when_cache_miss(self):
        # Nothing is pre-loaded into the cache here; after waiting up to two
        # seconds the queue must be drained and a response recorded.
        octopus = TornadoOctopus(cache=True, auto_start=True)

        def store_result(url, response):
            # Keep what the callback received on the test case for the
            # assertions below.
            self.url, self.response = url, response

        octopus.enqueue('http://www.google.com', store_result, method='GET')
        octopus.wait(2)

        expect(octopus.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
Ejemplo n.º 15
0
    def test_can_handle_exception(self, log_mock):
        # A callback that raises must result in exactly one call to the
        # patched logger with the "Error calling callback" message.
        failing_url = 'http://www.globo.com'
        octopus = TornadoOctopus(concurrency=4, auto_start=True)

        def exploding_callback(url, response):
            raise RuntimeError(url)

        octopus.enqueue(failing_url, exploding_callback)
        octopus.wait(2)

        log_mock.assert_called_once_with(
            'Error calling callback for http://www.globo.com.')
Ejemplo n.º 16
0
    def test_can_enqueue_url_and_fetch(self, fetch_mock):
        # enqueue() must leave the queue empty and forward the URL, handler,
        # method and extra keyword argument to the patched fetch.
        target_url = 'http://www.google.com'
        octopus = TornadoOctopus(cache=True)

        octopus.enqueue(target_url, None, method='GET', something="else")

        expect(octopus.url_queue).to_be_empty()
        fetch_mock.assert_called_once_with(
            target_url, None, 'GET', something='else')
Ejemplo n.º 17
0
    def test_can_handle_timeouts(self):
        # With a 0.1 second request timeout the recorded response is expected
        # to carry status 599, no body and a non-null error.
        slow_url = 'http://baidu.com'
        octopus = TornadoOctopus(
            concurrency=1, request_timeout_in_seconds=0.1, auto_start=True)

        def store_response(url, response):
            self.response = response

        octopus.enqueue(slow_url, store_response)
        octopus.wait(5)

        received = self.response
        expect(received.status_code).to_equal(599)
        expect(received.text).to_be_null()
        expect(received.error).not_to_be_null()
Ejemplo n.º 18
0
    def test_can_enqueue_and_get_from_cache(self):
        # A response is stored in the cache up front; enqueue() must invoke
        # the callback with that very object and leave the queue empty.
        target_url = 'http://www.google.com'
        cached_response = Mock()
        octopus = TornadoOctopus(cache=True)
        octopus.response_cache.put(target_url, cached_response)

        def store_result(url, response):
            self.url, self.response = url, response

        octopus.enqueue(target_url, store_result, method='GET')

        expect(octopus.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
        expect(self.response).to_equal(cached_response)
Ejemplo n.º 19
0
    def test_can_enqueue_and_get_from_cache(self):
        # The cache already holds an entry for the URL, so the callback must
        # receive that cached object without anything staying queued.
        target_url = 'http://www.google.com'
        cached_response = Mock()
        octopus = TornadoOctopus(cache=True)
        octopus.response_cache.put(target_url, cached_response)

        def store_result(url, response):
            self.url, self.response = url, response

        octopus.enqueue(target_url, store_result, method='GET')

        expect(octopus.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
        expect(self.response).to_equal(cached_response)
Ejemplo n.º 20
0
    def test_handle_request(self, stop_mock):
        # running_urls starts at 0 here, so handling one response is expected
        # to drop it to -1, invoke the callback and trigger the patched stop.
        octopus = TornadoOctopus(cache=False, auto_start=True)
        tornado_response = self.get_response()
        on_done = Mock()

        response_handler = octopus.handle_request('some url', on_done)
        response_handler(tornado_response)

        expect(octopus.running_urls).to_equal(-1)
        expect(on_done.called).to_be_true()
        expect(stop_mock.called).to_be_true()
Ejemplo n.º 21
0
    def test_handle_request(self, stop_mock):
        # Handling a single response with nothing running is expected to
        # leave running_urls at -1, call the callback and call stop.
        octopus = TornadoOctopus(cache=False, auto_start=True)
        tornado_response = self.get_response()
        on_done = Mock()

        response_handler = octopus.handle_request('some url', on_done)
        response_handler(tornado_response)

        expect(octopus.running_urls).to_equal(-1)
        expect(on_done.called).to_be_true()
        expect(stop_mock.called).to_be_true()
Ejemplo n.º 22
0
    def test_fetch_gets_the_response_from_cache_if_available(self):
        # With cache=True and a stored entry, fetch() must answer from the
        # cache: no HTTP call, no running URL, callback gets the cached item.
        target_url = 'http://www.google.com'
        cached_response = Mock()
        octopus = TornadoOctopus(cache=True, auto_start=True)
        octopus.response_cache.put(target_url, cached_response)

        fake_client = Mock()
        octopus.http_client = fake_client
        on_done = Mock()

        octopus.fetch(target_url, on_done, 'GET')

        expect(octopus.running_urls).to_equal(0)
        expect(fake_client.fetch.called).to_be_false()
        on_done.assert_called_once_with(target_url, cached_response)
Ejemplo n.º 23
0
    def test_can_handle_invalid_urls(self):
        # The URL below is expected to fail; the callback must still receive
        # a response object with status 599, no body and an error set.
        bogus_url = 'http://kagdjdkjgka.fk'
        octopus = TornadoOctopus(concurrency=1, auto_start=True)

        def store_response(url, response):
            self.response = response

        octopus.enqueue(bogus_url, store_response)
        octopus.wait(5)

        expect(self.response).not_to_be_null()
        received = self.response
        expect(received.status_code).to_equal(599)
        expect(received.text).to_be_null()
        expect(received.error).not_to_be_null()
Ejemplo n.º 24
0
    def test_handle_request_when_queue_has_no_items_but_running_urls(self):
        # Ten URLs are marked as running and the cache is mocked: handling a
        # response must decrement the counter to 9, call the callback and
        # store the response through response_cache.put.
        octopus = TornadoOctopus(cache=True, auto_start=True)
        octopus.response_cache = Mock()
        octopus.running_urls = 10

        tornado_response = self.get_response()
        on_done = Mock()

        response_handler = octopus.handle_request('some url', on_done)
        response_handler(tornado_response)

        expect(octopus.running_urls).to_equal(9)
        expect(on_done.called).to_be_true()
        expect(octopus.response_cache.put.called).to_be_true()
Ejemplo n.º 25
0
    def test_fetch_gets_the_response_from_cache_if_available(self):
        # A cached entry must be served directly: the mocked HTTP client is
        # never used and the callback receives the cached object.
        target_url = 'http://www.google.com'
        cached_response = Mock()
        octopus = TornadoOctopus(cache=True, auto_start=True)
        octopus.response_cache.put(target_url, cached_response)

        fake_client = Mock()
        octopus.http_client = fake_client
        on_done = Mock()

        octopus.fetch(target_url, on_done, 'GET')

        expect(octopus.running_urls).to_equal(0)
        expect(fake_client.fetch.called).to_be_false()
        on_done.assert_called_once_with(target_url, cached_response)
Ejemplo n.º 26
0
    def test_handle_request_when_queue_has_no_items_but_running_urls(self):
        # Starting from ten running URLs and a mocked cache, one handled
        # response must leave nine running, call the callback and hit
        # response_cache.put.
        octopus = TornadoOctopus(cache=True, auto_start=True)
        octopus.response_cache = Mock()
        octopus.running_urls = 10

        tornado_response = self.get_response()
        on_done = Mock()

        response_handler = octopus.handle_request('some url', on_done)
        response_handler(tornado_response)

        expect(octopus.running_urls).to_equal(9)
        expect(on_done.called).to_be_true()
        expect(octopus.response_cache.put.called).to_be_true()
Ejemplo n.º 27
0
 def start_otto(self):
     # Log the configured concurrency, then create the TornadoOctopus from
     # the worker's options/config and start it.
     concurrency = self.options.concurrency
     self.info(
         'Starting Octopus with %d concurrent threads.' % concurrency)

     octopus = TornadoOctopus(
         concurrency=concurrency,
         cache=self.options.cache,
         connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
         request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
         limiter=self.get_otto_limiter(),
     )
     self.otto = octopus
     self.otto.start()
def get_avatars(urls):
    """Fetch every URL in ``urls`` and return the collected response bodies.

    A response whose body is exactly ``'Not found'`` is reported on stdout
    and left out of the result. Bodies are appended in the order the
    callbacks run.
    """
    collected = []
    octopus = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True, expiration_in_seconds=60
    )

    def on_response(url, response):
        if 'Not found' != response.text:
            collected.append(response.text)
            return
        print('URL Not Found: %s' % url)

    for avatar_url in urls:
        octopus.enqueue(avatar_url, on_response)

    # Block until the octopus reports completion (default wait timeout).
    octopus.wait()

    return collected
Ejemplo n.º 29
0
    def test_handle_request_when_queue_has_items(self, fetch_mock):
        # One request is waiting in the queue; after a response is handled
        # the queue must be empty and the patched fetch must have been called
        # with the queued entry's URL, handler, method and keyword arguments.
        octopus = TornadoOctopus(cache=False, auto_start=True)

        queued_handler = Mock()
        queued_entry = ('other url', queued_handler, 'POST', {'foo': 'bar'})
        octopus.url_queue.append(queued_entry)

        tornado_response = self.get_response()
        on_done = Mock()

        response_handler = octopus.handle_request('some url', on_done)
        response_handler(tornado_response)

        expect(octopus.running_urls).to_equal(-1)
        expect(octopus.url_queue).to_be_empty()
        expect(on_done.called).to_be_true()
        fetch_mock.assert_called_once_with(
            'other url', queued_handler, 'POST', foo='bar')
Ejemplo n.º 30
0
    def test_can_get_response_from_tornado_response_when_no_cookies(self):
        # The fixture's request carries empty headers, so the converted
        # response must expose an empty cookie collection.
        target_url = 'http://www.google.com'
        tornado_response = self.get_response(request=Mock(headers={}))

        converted = TornadoOctopus.from_tornado_response(
            target_url, tornado_response)

        expect(converted.url).to_equal(target_url)
        expect(converted.headers).to_be_like(tornado_response.headers)
        expect(converted.cookies).to_be_empty()
        expect(converted.text).to_equal('body')
        expect(converted.error).to_equal('error')
        expect(converted.request_time).to_equal(2.1)
Ejemplo n.º 31
0
    def test_can_get_response_from_tornado_response_when_no_cookies(self):
        # With empty request headers in the fixture, the converted response
        # must report no cookies while mirroring the other fields.
        target_url = 'http://www.google.com'
        tornado_response = self.get_response(request=Mock(headers={}))

        converted = TornadoOctopus.from_tornado_response(
            target_url, tornado_response)

        expect(converted.url).to_equal(target_url)
        expect(converted.headers).to_be_like(tornado_response.headers)
        expect(converted.cookies).to_be_empty()
        expect(converted.text).to_equal('body')
        expect(converted.error).to_equal('error')
        expect(converted.request_time).to_equal(2.1)
Ejemplo n.º 32
0
    def test_can_get_response_from_tornado_response(self):
        # The default fixture response is expected to convert into a
        # response exposing url, headers, a foo=bar cookie, body, error and
        # request time.
        target_url = 'http://www.google.com'
        tornado_response = self.get_response()

        converted = TornadoOctopus.from_tornado_response(
            target_url, tornado_response)

        expect(converted.url).to_equal(target_url)
        expect(converted.headers).to_be_like(tornado_response.headers)
        expect(converted.cookies).to_be_like({'foo': 'bar'})
        expect(converted.text).to_equal('body')
        expect(converted.error).to_equal('error')
        expect(converted.request_time).to_equal(2.1)
Ejemplo n.º 33
0
    def test_can_get_many_urls(self):
        # Three URLs are enqueued; after waiting up to two seconds each one
        # must have a recorded response with status 200.
        urls = [
            'http://www.globo.com',
            'http://www.twitter.com',
            'http://www.facebook.com',
        ]
        octopus = TornadoOctopus(concurrency=4, auto_start=True)

        def record_response(url, response):
            self.responses[url] = response

        for url in urls:
            octopus.enqueue(url, record_response)

        octopus.wait(2)

        expect(self.responses).to_length(3)

        for url in urls:
            expect(self.responses).to_include(url)
            recorded = self.responses[url]
            expect(recorded.status_code).to_equal(200)
Ejemplo n.º 34
0
    def test_handle_request_when_queue_has_items(self, fetch_mock):
        # A queued entry must be taken off the queue and passed to the
        # patched fetch once a response has been handled.
        octopus = TornadoOctopus(cache=False, auto_start=True)

        queued_handler = Mock()
        queued_entry = ('other url', queued_handler, 'POST', {'foo': 'bar'})
        octopus.url_queue.append(queued_entry)

        tornado_response = self.get_response()
        on_done = Mock()

        response_handler = octopus.handle_request('some url', on_done)
        response_handler(tornado_response)

        expect(octopus.running_urls).to_equal(-1)
        expect(octopus.url_queue).to_be_empty()
        expect(on_done.called).to_be_true()
        fetch_mock.assert_called_once_with(
            'other url', queued_handler, 'POST', foo='bar')
Ejemplo n.º 35
0
    def test_can_get_response_from_tornado_response(self):
        # Converting the default fixture response must preserve url, headers,
        # the foo=bar cookie, body, error and request time.
        target_url = 'http://www.google.com'
        tornado_response = self.get_response()

        converted = TornadoOctopus.from_tornado_response(
            target_url, tornado_response)

        expected_cookies = {'foo': 'bar'}
        expect(converted.url).to_equal(target_url)
        expect(converted.headers).to_be_like(tornado_response.headers)
        expect(converted.cookies).to_be_like(expected_cookies)
        expect(converted.text).to_equal('body')
        expect(converted.error).to_equal('error')
        expect(converted.request_time).to_equal(2.1)
Ejemplo n.º 36
0
def _download_url_list(image_url_list):
    '''Fetch every URL in `image_url_list` and return the response bodies.

    A response whose body is exactly 'Not found' is not collected; its URL
    is printed instead. `wait(0)` is used, i.e. no wait timeout is passed.
    '''
    downloaded = []
    octopus = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True, expiration_in_seconds=60
    )

    def on_response(url, response):
        if 'Not found' != response.text:
            downloaded.append(response.text)
            return
        print(url)

    for image_url in image_url_list:
        octopus.enqueue(image_url, on_response)

    octopus.wait(0)

    return downloaded
Ejemplo n.º 37
0
    def test_can_wait_when_urls_and_timeout(self):
        # With URLs marked as running and the default timeout, wait() is
        # expected to arm the ioloop's blocking-signal threshold.
        otto = TornadoOctopus(cache=False, auto_start=True)
        otto.ioloop = Mock()
        otto.running_urls = 10

        otto.wait()

        # The original built the expectation but never applied a matcher, so
        # nothing was asserted; `.to_be_true()` makes the check effective.
        expect(otto.ioloop.set_blocking_signal_threshold.called).to_be_true()
Ejemplo n.º 38
0
    def test_can_wait_when_urls_and_no_timeout(self, logging_mock):
        # With URLs marked as running and a timeout of 0, wait() is expected
        # to log that it is waiting for the URLs.
        otto = TornadoOctopus(cache=False, auto_start=True)
        otto.ioloop = Mock()
        otto.running_urls = 10

        otto.wait(0)

        # `assert_calls` is not a Mock assertion (it silently passes on old
        # mock versions and raises AttributeError on new ones); use a real
        # assertion instead.
        # NOTE(review): assumes logging_mock replaces the logging function
        # itself (the patch decorator is not visible here) -- confirm.
        logging_mock.assert_any_call('Waiting for urls to be retrieved.')
Ejemplo n.º 39
0
    def test_can_create_tornado_otto(self):
        # Pins the defaults of a TornadoOctopus built with no arguments.
        octopus = TornadoOctopus()

        # Basic switches.
        expect(octopus.concurrency).to_equal(10)
        expect(octopus.auto_start).to_be_false()
        expect(octopus.cache).to_be_false()

        # A response cache exists even though caching is off.
        response_cache = octopus.response_cache
        expect(response_cache).not_to_be_null()
        expect(response_cache).to_be_instance_of(Cache)
        expect(response_cache.expiration_in_seconds).to_equal(30)

        # Timeouts and HTTP client selection.
        expect(octopus.request_timeout_in_seconds).to_equal(10)
        expect(octopus.connect_timeout_in_seconds).to_equal(5)
        expect(octopus.ignore_pycurl).to_be_false()

        # Nothing queued or running yet.
        expect(octopus.running_urls).to_equal(0)
        expect(octopus.url_queue).to_be_empty()
Ejemplo n.º 40
0
    def test_can_enqueue_and_get_when_cache_miss(self):
        # No cache entry is prepared; after waiting up to two seconds the
        # queue must be empty and a response must have been recorded.
        octopus = TornadoOctopus(cache=True, auto_start=True)

        def store_result(url, response):
            self.url, self.response = url, response

        octopus.enqueue('http://www.google.com', store_result, method='GET')
        octopus.wait(2)

        expect(octopus.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
Ejemplo n.º 41
0
    def test_can_handle_exception(self, log_mock):
        # A raising callback must produce exactly one call to the patched
        # logger with the "Error calling callback" message.
        failing_url = 'http://www.globo.com'
        octopus = TornadoOctopus(concurrency=4, auto_start=True)

        def exploding_callback(url, response):
            raise RuntimeError(url)

        octopus.enqueue(failing_url, exploding_callback)
        octopus.wait(2)

        expected_message = 'Error calling callback for http://www.globo.com.'
        log_mock.assert_called_once_with(expected_message)
Ejemplo n.º 42
0
    def test_can_handle_invalid_urls(self):
        # The callback must still be handed a response for a failing URL:
        # status 599, no body, error populated.
        bogus_url = 'http://kagdjdkjgka.fk'
        octopus = TornadoOctopus(concurrency=1, auto_start=True)

        def store_response(url, response):
            self.response = response

        octopus.enqueue(bogus_url, store_response)
        octopus.wait(5)

        expect(self.response).not_to_be_null()
        received = self.response
        expect(received.status_code).to_equal(599)
        expect(received.text).to_be_null()
        expect(received.error).not_to_be_null()
Ejemplo n.º 43
0
    def test_can_handle_timeouts(self):
        # A 0.1 second request timeout is expected to yield a response with
        # status 599, no body and a non-null error.
        slow_url = 'http://baidu.com'
        octopus = TornadoOctopus(
            concurrency=1, request_timeout_in_seconds=0.1, auto_start=True)

        def store_response(url, response):
            self.response = response

        octopus.enqueue(slow_url, store_response)
        octopus.wait(5)

        received = self.response
        expect(received.status_code).to_equal(599)
        expect(received.text).to_be_null()
        expect(received.error).not_to_be_null()
Ejemplo n.º 44
0
    def test_can_create_tornado_otto_with_custom_values(self):
        # Every constructor argument is overridden and must be reflected on
        # the resulting instance.
        octopus = TornadoOctopus(
            concurrency=20,
            auto_start=True,
            cache=True,
            expiration_in_seconds=60,
            request_timeout_in_seconds=20,
            connect_timeout_in_seconds=10,
            ignore_pycurl=True,
        )

        # Basic switches.
        expect(octopus.concurrency).to_equal(20)
        expect(octopus.auto_start).to_be_true()
        expect(octopus.cache).to_be_true()

        # Cache carries the custom expiration.
        response_cache = octopus.response_cache
        expect(response_cache).not_to_be_null()
        expect(response_cache).to_be_instance_of(Cache)
        expect(response_cache.expiration_in_seconds).to_equal(60)

        # Timeouts and HTTP client selection.
        expect(octopus.request_timeout_in_seconds).to_equal(20)
        expect(octopus.connect_timeout_in_seconds).to_equal(10)
        expect(octopus.ignore_pycurl).to_be_true()

        # Nothing queued or running yet.
        expect(octopus.running_urls).to_equal(0)
        expect(octopus.url_queue).to_be_empty()
Ejemplo n.º 45
0
def tornado_requests(repetitions, concurrency, urls_to_retrieve, ignore_pycurl=False):
    """Time how long TornadoOctopus takes to retrieve ``urls_to_retrieve``.

    Prints a banner, enqueues every URL with the module's
    ``handle_url_response`` callback and returns the number of seconds
    spent inside ``otto.wait(0)``.

    ``repetitions`` is accepted but not used by this function.
    ``ignore_pycurl`` is forwarded to TornadoOctopus and also selects the
    client name shown in the banner.
    """
    client_label = "using SimpleHTTPClient" if ignore_pycurl else "using pycurl"
    message = "Retrieving URLs concurrently with TornadoOctopus (%s)..." % (
        client_label
    )
    separator = "=" * len(message)

    # A bare ``print`` only emits a blank line when print is a statement;
    # with print-function semantics it is a no-op expression. ``print("")``
    # emits the blank line in both cases.
    print("")
    print(separator)
    print(message)
    print(separator)
    print("")

    otto = TornadoOctopus(concurrency=concurrency, cache=False, auto_start=True, ignore_pycurl=ignore_pycurl)

    for url in urls_to_retrieve:
        otto.enqueue(url, handle_url_response)

    # Only the wait is timed; enqueueing happens before the clock starts.
    start_time = time()
    otto.wait(0)

    return time() - start_time
Ejemplo n.º 46
0
    def test_can_get_many_urls(self):
        # Every one of the three enqueued URLs must end up with a recorded
        # response whose status code is 200.
        urls = [
            'http://www.globo.com',
            'http://www.twitter.com',
            'http://www.facebook.com',
        ]
        octopus = TornadoOctopus(concurrency=4, auto_start=True)

        def record_response(url, response):
            self.responses[url] = response

        for url in urls:
            octopus.enqueue(url, record_response)

        octopus.wait(2)

        expect(self.responses).to_length(3)

        for url in urls:
            expect(self.responses).to_include(url)
            recorded = self.responses[url]
            expect(recorded.status_code).to_equal(200)
Ejemplo n.º 47
0
class BaseWorker(BaseCLI):
    """Worker base class: class loading, Octopus set-up, cached HTTP
    requests, error recovery and event publishing."""

    def _load_validators(self):
        """Return the classes loaded from ``config.VALIDATORS``."""
        validator_names = self.config.VALIDATORS
        return load_classes(default=validator_names)

    def _load_facters(self):
        """Return the classes loaded from ``config.FACTERS``."""
        facter_names = self.config.FACTERS
        return load_classes(default=facter_names)

    def get_otto_limiter(self):
        """Build a Limiter from the cached domain limiters.

        Returns None when the cache reports no domain limiters; otherwise
        the new limiter, already subscribed to ``handle_limiter_miss``.
        """
        domains = self.cache.get_domain_limiters()
        if not domains:
            return None

        limiter = Limiter(
            *domains,
            redis=self.redis,
            expiration_in_seconds=self.config.LIMITER_LOCKS_EXPIRATION
        )
        limiter.subscribe_to_lock_miss(self.handle_limiter_miss)
        return limiter

    def update_otto_limiter(self):
        """Push the current domain limiters into the running limiter, if
        the Octopus instance has one."""
        domains = self.cache.get_domain_limiters()

        if not hasattr(self.otto, 'limiter') or self.otto.limiter is None:
            return

        self.otto.limiter.update_domain_definitions(*domains)

    def start_otto(self):
        """Create the TornadoOctopus from options/config and start it."""
        concurrency = self.options.concurrency
        self.info(
            'Starting Octopus with %d concurrent threads.' % concurrency)

        self.otto = TornadoOctopus(
            concurrency=concurrency,
            cache=self.options.cache,
            connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
            request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
            limiter=self.get_otto_limiter(),
        )
        self.otto.start()

    def handle_error(self, exc_type, exc_value, tb):
        """Recover from an error and notify every error handler.

        Rolls back the DB session when its connection is not invalidated
        (a failing rollback is only logged), empties the Octopus queue,
        replaces the DB session with a fresh scoped session and finally
        forwards the exception details to each registered handler.
        """
        try:
            connection = self.db.connection()
            if not connection.invalidated:
                self.db.rollback()
        except Exception as rollback_error:
            logging.error("Cannot rollback: %s" % str(rollback_error))

        # Drop pending requests and rebuild the session from scratch.
        self.otto.url_queue = []
        self.db.close_all()
        self.db.remove()
        self.db = scoped_session(self.sqlalchemy_db_maker)

        for error_handler in self.error_handlers:
            # A new dict per handler, exactly one for each call.
            extra = {
                'worker-uuid': self.uuid,
                'holmes-version': __version__
            }
            error_handler.handle_exception(
                exc_type, exc_value, tb, extra=extra)

    def async_get(self, url, handler, method='GET', **kw):
        """Serve ``url`` from the request cache or enqueue it for fetching.

        On a cache hit ``handler(url, response)`` is called right away;
        otherwise the proxy settings are added to ``kw`` and the request is
        enqueued with a handler that also stores the response in the cache.
        """
        url, response = self.cache.get_request(url)

        kw['user_agent'] = self.config.HOLMES_USER_AGENT

        if response:
            handler(url, response)
            return

        kw['proxy_host'] = self.config.HTTP_PROXY_HOST
        kw['proxy_port'] = self.config.HTTP_PROXY_PORT

        self.debug('Enqueueing %s for %s...' % (method, url))
        caching_handler = self.handle_response(url, handler)
        self.otto.enqueue(url, caching_handler, method, **kw)

    def handle_response(self, url, handler):
        """Wrap ``handler`` so each response is cached before delivery."""
        def handle(url, response):
            expiration = self.config.REQUEST_CACHE_EXPIRATION_IN_SECONDS
            self.cache.set_request(
                url,
                response.status_code,
                response.headers,
                response.cookies,
                response.text,
                response.effective_url,
                response.error,
                response.request_time,
                expiration)
            handler(url, response)

        return handle

    def handle_limiter_miss(self, url):
        """Limiter lock-miss hook; does nothing in this base class."""
        pass

    def publish(self, data):
        """Publish ``data`` on the 'events' redis pub/sub channel."""
        self.redis_pub_sub.publish('events', data)
    def test_should_call_limiter_miss_twice(self):
        # Two domains are each limited to one concurrent request and two
        # URLs are enqueued per domain, so two lock misses are expected.
        limiter = PerDomainRedisLimiter(
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1},
            redis=self.redis
        )
        limiter.subscribe_to_lock_miss(self.handle_limiter_miss)
        octopus = TornadoOctopus(
            concurrency=10, auto_start=True, limiter=limiter)

        urls_in_order = (
            'http://globoesporte.globo.com/',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com/',
            'http://g1.globo.com/economia/',
        )
        for page_url in urls_in_order:
            octopus.enqueue(page_url, self.handle_url_response)

        octopus.wait()

        expect(self.cache_miss).to_length(2)
    def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self):
        # Both domains are limited to one concurrent request; all four URLs
        # must still be answered and the limiter must track both domains.
        limiter = PerDomainInMemoryLimiter(
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1}
        )
        octopus = TornadoOctopus(
            concurrency=10, auto_start=True, limiter=limiter)

        urls_in_order = (
            'http://globoesporte.globo.com',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com',
            'http://g1.globo.com/economia',
        )
        for page_url in urls_in_order:
            octopus.enqueue(page_url, self.handle_url_response)

        octopus.wait(2)

        expect(self.responses).to_length(4)
        tracked_domains = list(limiter.domain_count.keys())
        expect(tracked_domains).to_be_like(
            ['http://g1.globo.com', 'http://globoesporte.globo.com'])
Ejemplo n.º 50
0
    def test_can_enqueue_url(self):
        # With concurrency=0 the request is expected to remain queued, so
        # the queue must hold exactly one entry.
        octopus = TornadoOctopus(cache=False, concurrency=0)

        octopus.enqueue(
            'http://www.google.com', None, method='GET', something="else")

        pending = octopus.url_queue
        expect(pending).to_length(1)
Ejemplo n.º 51
0
class BaseWorker(BaseCLI):
    """Worker base class: class loading, Octopus set-up, cached HTTP
    requests, error reporting, event publishing and key insertion."""

    def _load_validators(self):
        """Return the classes loaded from ``config.VALIDATORS``."""
        return load_classes(default=self.config.VALIDATORS)

    def _load_facters(self):
        """Return the classes loaded from ``config.FACTERS``."""
        return load_classes(default=self.config.FACTERS)

    def get_otto_limiter(self):
        """Build a Limiter from the cached domain limiters.

        Returns None when the cache reports no domain limiters; otherwise
        the new limiter, already subscribed to ``handle_limiter_miss``.
        """
        domains = self.cache.get_domain_limiters()
        limiter = None

        if domains:
            limiter = Limiter(
                *domains,
                redis=self.redis,
                expiration_in_seconds=self.config.LIMITER_LOCKS_EXPIRATION
            )

            limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

        return limiter

    def update_otto_limiter(self):
        """Push the current domain limiters into the running limiter.

        Does nothing when the Octopus instance has no limiter.
        """
        domains = self.cache.get_domain_limiters()

        # start_otto() passes the result of get_otto_limiter(), which is
        # None when no domains are configured; guard against calling a
        # method on None in that case.
        if hasattr(self.otto, 'limiter') and self.otto.limiter is not None:
            self.otto.limiter.update_domain_definitions(*domains)

    def start_otto(self):
        """Create the TornadoOctopus from options/config and start it."""
        self.info('Starting Octopus with %d concurrent threads.' % self.options.concurrency)
        self.otto = TornadoOctopus(
            concurrency=self.options.concurrency, cache=self.options.cache,
            connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
            request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
            limiter=self.get_otto_limiter()
        )
        self.otto.start()

    def handle_error(self, exc_type, exc_value, tb):
        """Forward the exception details to every registered error handler,
        tagged with the worker uuid and the holmes version."""
        for handler in self.error_handlers:
            handler.handle_exception(
                exc_type, exc_value, tb, extra={
                    'worker-uuid': self.uuid,
                    'holmes-version': __version__
                }
            )

    def async_get(self, url, handler, method='GET', **kw):
        """Serve ``url`` from the request cache or enqueue it for fetching.

        On a cache hit ``handler(url, response)`` is called right away;
        otherwise the proxy settings are added to ``kw`` and the request is
        enqueued with a handler that also stores the response in the cache.
        """
        url, response = self.cache.get_request(url)

        if not response:
            kw['proxy_host'] = self.config.HTTP_PROXY_HOST
            kw['proxy_port'] = self.config.HTTP_PROXY_PORT

            self.debug('Enqueueing %s for %s...' % (method, url))
            self.otto.enqueue(url, self.handle_response(url, handler), method, **kw)
        else:
            handler(url, response)

    def handle_response(self, url, handler):
        """Wrap ``handler`` so each response is cached before delivery."""
        def handle(url, response):
            self.cache.set_request(
                url, response.status_code, response.headers, response.cookies,
                response.text, response.effective_url, response.error, response.request_time,
                self.config.REQUEST_CACHE_EXPIRATION_IN_SECONDS
            )
            handler(url, response)
        return handle

    def handle_limiter_miss(self, url):
        """Limiter lock-miss hook; does nothing in this base class."""
        pass

    def publish(self, data):
        """Publish ``data`` on the 'events' redis pub/sub channel."""
        self.redis_pub_sub.publish('events', data)

    def _insert_keys(self, keys):
        """Get or create a ``Key`` for every name in ``keys``.

        ``keys`` maps a key name to a dict; the ``Key`` object is stored
        under that dict's ``'key'`` entry. Each key is added and committed
        in its own sub-transaction.
        """
        # Imported here rather than at module level, as in the original.
        from holmes.models import Key

        # Only nested dicts are modified, never the set of names, so the
        # mapping can be iterated directly.
        for name in keys:
            self.db.begin(subtransactions=True)
            key = Key.get_or_create(self.db, name)
            keys[name]['key'] = key
            self.db.add(key)
            self.db.commit()
Ejemplo n.º 52
0
class BaseWorker(BaseCLI):
    """Worker base class: class loading, Octopus fetching and error handling."""

    def _load_validators(self):
        """Return ``load_classes`` applied to the configured ``VALIDATORS``."""
        return load_classes(default=self.config.VALIDATORS)

    def _load_facters(self):
        """Return ``load_classes`` applied to the configured ``FACTERS``."""
        return load_classes(default=self.config.FACTERS)

    def get_otto_limiter(self):
        """Build the limiter handed to Octopus, or ``None`` without domains.

        The domain definitions come from ``self.cache.get_domain_limiters()``.
        When that result is truthy, a ``Limiter`` is created over them and
        ``handle_limiter_miss`` is subscribed to its lock misses.
        """
        domains = self.cache.get_domain_limiters()
        if not domains:
            return None

        limiter = Limiter(
            *domains,
            redis=self.redis,
            expiration_in_seconds=self.config.LIMITER_LOCKS_EXPIRATION
        )
        limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

        return limiter

    def update_otto_limiter(self):
        """Push the cached domain definitions into the running limiter.

        Does nothing when ``self.otto`` has no limiter. A falsy cache result
        (for example ``None``) is treated as "no definitions".
        """
        domains = self.cache.get_domain_limiters()

        if hasattr(self.otto, 'limiter') and self.otto.limiter is not None:
            # get_otto_limiter() already treats a falsy cache result as "no
            # limiters"; unpacking None here would raise TypeError instead.
            self.otto.limiter.update_domain_definitions(*(domains or ()))

    def start_otto(self):
        """Create ``self.otto`` from the options and config, then start it."""
        self.info('Starting Octopus with %d concurrent threads.' % self.options.concurrency)
        self.otto = TornadoOctopus(
            concurrency=self.options.concurrency, cache=self.options.cache,
            connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
            request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
            limiter=self.get_otto_limiter()
        )
        self.otto.start()

    def handle_error(self, exc_type, exc_value, tb):
        """Reset worker state after a failure and notify the error handlers.

        The database session is rolled back when its connection is not
        invalidated, the Octopus URL queue is emptied and the session is
        replaced by a new ``scoped_session``. Every handler in
        ``self.error_handlers`` then receives the exception triple plus an
        ``extra`` dict with the worker uuid and the holmes version.
        """
        try:
            if not self.db.connection().invalidated:
                self.db.rollback()
        except Exception as err:
            # Best effort: a failed rollback is logged so that the cleanup
            # and handler notification below still run.
            logging.error("Cannot rollback: %s" % str(err))

        # Drop pending URLs and swap in a fresh scoped session.
        self.otto.url_queue = []
        self.db.close_all()
        self.db.remove()
        self.db = scoped_session(self.sqlalchemy_db_maker)

        for handler in self.error_handlers:
            handler.handle_exception(
                exc_type, exc_value, tb, extra={
                    'worker-uuid': self.uuid,
                    'holmes-version': __version__
                }
            )

    def async_get(self, url, handler, method='GET', **kw):
        """Deliver a response for ``url`` to ``handler``, using the cache first.

        ``self.cache.get_request(url)`` yields a ``(url, response)`` pair. A
        truthy response is handed to ``handler`` right away. Otherwise the
        request is enqueued on ``self.otto`` with the configured user agent
        and proxy settings, and with the callback built by
        ``handle_response``.
        """
        url, response = self.cache.get_request(url)

        kw['user_agent'] = self.config.HOLMES_USER_AGENT

        if not response:
            kw['proxy_host'] = self.config.HTTP_PROXY_HOST
            kw['proxy_port'] = self.config.HTTP_PROXY_PORT

            self.debug('Enqueueing %s for %s...' % (method, url))
            self.otto.enqueue(url, self.handle_response(url, handler), method, **kw)
        else:
            handler(url, response)

    def handle_response(self, url, handler):
        """Return a callback that caches a response before forwarding it.

        The callback takes ``(url, response)``, stores the response fields in
        ``self.cache`` with the configured expiration and then calls
        ``handler(url, response)``. The ``url`` passed to this method is
        shadowed by the callback's own ``url`` argument and is not used.
        """
        def handle(url, response):
            self.cache.set_request(
                url, response.status_code, response.headers, response.cookies,
                response.text, response.effective_url, response.error, response.request_time,
                self.config.REQUEST_CACHE_EXPIRATION_IN_SECONDS
            )
            handler(url, response)
        return handle

    def handle_limiter_miss(self, url):
        """Limiter lock-miss callback (see ``get_otto_limiter``); a no-op here."""
        pass

    def publish(self, data):
        """Send ``data`` to the ``events`` channel via ``self.redis_pub_sub``."""
        self.redis_pub_sub.publish('events', data)
    def test_should_call_limiter_miss_twice(self):
        # Two URLs are enqueued for each of two domains limited to one
        # connection; the test expects ``self.cache_miss`` to end up with
        # exactly two entries.
        domain_limits = (
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1},
        )
        limiter = PerDomainRedisLimiter(*domain_limits, redis=self.redis)
        limiter.subscribe_to_lock_miss(self.handle_limiter_miss)
        otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

        urls = (
            'http://globoesporte.globo.com/',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com/',
            'http://g1.globo.com/economia/',
        )
        for url in urls:
            otto.enqueue(url, self.handle_url_response)

        otto.wait()

        expect(self.cache_miss).to_length(2)
Ejemplo n.º 54
0
    def test_can_wait_when_no_urls(self, logging_mock):
        # wait() on an Octopus with nothing enqueued should log that it is
        # returning immediately.
        otto = TornadoOctopus(cache=False, auto_start=True)

        otto.wait()

        # `assert_calls` is not a Mock assertion: older mock versions silently
        # auto-create it as a child mock (so nothing is verified) and newer
        # ones raise AttributeError. assert_any_call really checks the message.
        # NOTE(review): assumes logging_mock patches the logging function that
        # wait() calls - confirm against the patch decorator.
        logging_mock.assert_any_call(
            'No urls to wait for. Returning immediately.')
    def test_should_not_get_more_than_one_url_for_same_domain_concurrently(
            self):
        # Two URLs are enqueued for each of two domains limited to one
        # connection; all four responses must arrive and the limiter must
        # track exactly those two domains.
        limiter = PerDomainInMemoryLimiter(
            {'http://g1.globo.com': 1}, {'http://globoesporte.globo.com': 1})
        otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

        urls = (
            'http://globoesporte.globo.com',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com',
            'http://g1.globo.com/economia',
        )
        for url in urls:
            otto.enqueue(url, self.handle_url_response)

        otto.wait(2)

        expect(self.responses).to_length(4)
        tracked_domains = list(limiter.domain_count.keys())
        expect(tracked_domains).to_be_like(
            ['http://g1.globo.com', 'http://globoesporte.globo.com'])
Ejemplo n.º 56
0
    def test_can_get_queue_info(self):
        # Checks the queue introspection properties of a TornadoOctopus
        # built with default arguments.
        otto = TornadoOctopus()

        # Nothing has been enqueued, so the queue size is expected to be 0
        # and is_empty is expected to be true.
        expect(otto.queue_size).to_equal(0)
        expect(otto.is_empty).to_be_true()