Ejemplo n.º 1
0
    def test_can_enqueue_url_and_fetch(self, fetch_mock):
        """Enqueueing on a cache-enabled instance should go straight to fetch."""
        octopus = TornadoOctopus(cache=True)

        octopus.enqueue('http://www.google.com', None, method='GET', something="else")

        # The queue drains because the request was handed directly to fetch.
        expect(octopus.url_queue).to_be_empty()
        fetch_mock.assert_called_once_with('http://www.google.com', None, 'GET', something='else')
Ejemplo n.º 2
0
    def test_can_enqueue_url(self):
        """With concurrency disabled, enqueued URLs should just pile up."""
        client = TornadoOctopus(cache=False, concurrency=0)

        client.enqueue(
            'http://www.google.com', None, method='GET', something="else")

        # Nothing can be fetched (concurrency=0), so the URL stays queued.
        expect(client.url_queue).to_length(1)
Ejemplo n.º 3
0
    def test_can_enqueue_and_get_when_cache_miss(self):
        """A cache miss should trigger a real fetch and invoke the callback."""
        octo = TornadoOctopus(cache=True, auto_start=True)

        def on_response(url, response):
            # Record what the callback received so the asserts below can see it.
            self.url = url
            self.response = response

        octo.enqueue('http://www.google.com', on_response, method='GET')
        octo.wait(2)

        expect(octo.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
Ejemplo n.º 4
0
    def test_can_enqueue_and_get_when_cache_miss(self):
        """When the URL is not cached, the response must come from the network."""
        instance = TornadoOctopus(cache=True, auto_start=True)

        def store_result(url, response):
            self.url, self.response = url, response

        instance.enqueue('http://www.google.com', store_result, method='GET')
        instance.wait(2)

        # Queue drained and the callback fired with a non-null response.
        expect(instance.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
Ejemplo n.º 5
0
    def test_can_enqueue_url_and_fetch(self, fetch_mock):
        """Cache-enabled enqueue should hand the URL straight to fetch."""
        client = TornadoOctopus(cache=True)

        client.enqueue(
            'http://www.google.com', None, method='GET', something="else")

        expect(client.url_queue).to_be_empty()
        # fetch must have been invoked exactly once with the enqueued args.
        fetch_mock.assert_called_once_with(
            'http://www.google.com', None, 'GET', something='else')
Ejemplo n.º 6
0
    def test_can_handle_exception(self, log_mock):
        """A callback that raises must be logged, not crash the loop."""
        target = 'http://www.globo.com'

        octo = TornadoOctopus(concurrency=4, auto_start=True)

        def exploding_callback(url, response):
            raise RuntimeError(url)

        octo.enqueue(target, exploding_callback)

        octo.wait(2)

        # The failure is reported through the log, once, with the URL.
        log_mock.assert_called_once_with('Error calling callback for http://www.globo.com.')
Ejemplo n.º 7
0
    def test_can_enqueue_and_get_from_cache(self):
        """A cached URL should be answered without touching the queue."""
        cached = Mock()
        octo = TornadoOctopus(cache=True)
        octo.response_cache.put('http://www.google.com', cached)

        def remember(url, response):
            self.url, self.response = url, response

        octo.enqueue('http://www.google.com', remember, method='GET')

        # Served synchronously from the cache: queue empty, same mock back.
        expect(octo.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
        expect(self.response).to_equal(cached)
Ejemplo n.º 8
0
    def test_can_handle_timeouts(self):
        """A request exceeding the timeout should yield a 599 error response."""
        target = 'http://baidu.com'
        octo = TornadoOctopus(
            concurrency=1, request_timeout_in_seconds=0.1, auto_start=True)

        def capture(url, response):
            self.response = response

        octo.enqueue(target, capture)

        octo.wait(5)

        # Timeouts surface as status 599 with no body and an error set.
        expect(self.response.status_code).to_equal(599)
        expect(self.response.text).to_be_null()
        expect(self.response.error).not_to_be_null()
Ejemplo n.º 9
0
    def test_can_handle_exception(self, log_mock):
        """Exceptions raised by a response callback are caught and logged."""
        failing_url = 'http://www.globo.com'

        crawler = TornadoOctopus(concurrency=4, auto_start=True)

        def boom(url, response):
            raise RuntimeError(url)

        crawler.enqueue(failing_url, boom)

        crawler.wait(2)

        log_mock.assert_called_once_with(
            'Error calling callback for http://www.globo.com.')
Ejemplo n.º 10
0
    def test_can_enqueue_and_get_from_cache(self):
        """Pre-seeded cache entries are returned directly to the callback."""
        seeded_response = Mock()
        crawler = TornadoOctopus(cache=True)
        crawler.response_cache.put('http://www.google.com', seeded_response)

        def record(url, response):
            self.url = url
            self.response = response

        crawler.enqueue('http://www.google.com', record, method='GET')

        expect(crawler.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
        # The exact object placed in the cache comes back out.
        expect(self.response).to_equal(seeded_response)
Ejemplo n.º 11
0
    def test_can_handle_invalid_urls(self):
        """An unresolvable hostname should produce a 599 error response."""
        bogus = 'http://kagdjdkjgka.fk'
        octo = TornadoOctopus(concurrency=1, auto_start=True)

        def keep(url, response):
            self.response = response

        octo.enqueue(bogus, keep)

        octo.wait(5)

        # Connection failures surface as status 599 with no body.
        expect(self.response).not_to_be_null()
        expect(self.response.status_code).to_equal(599)
        expect(self.response.text).to_be_null()
        expect(self.response.error).not_to_be_null()
Ejemplo n.º 12
0
    def test_can_handle_invalid_urls(self):
        """Requests to a domain that does not resolve still call back with an error."""
        unreachable = 'http://kagdjdkjgka.fk'
        crawler = TornadoOctopus(concurrency=1, auto_start=True)

        def save(url, response):
            self.response = response

        crawler.enqueue(unreachable, save)

        crawler.wait(5)

        expect(self.response).not_to_be_null()
        expect(self.response.status_code).to_equal(599)
        expect(self.response.text).to_be_null()
        expect(self.response.error).not_to_be_null()
Ejemplo n.º 13
0
    def test_can_handle_timeouts(self):
        """A very small request timeout must result in an error response."""
        slow_url = 'http://baidu.com'
        crawler = TornadoOctopus(concurrency=1,
                                 request_timeout_in_seconds=0.1,
                                 auto_start=True)

        def remember(url, response):
            self.response = response

        crawler.enqueue(slow_url, remember)

        crawler.wait(5)

        expect(self.response.status_code).to_equal(599)
        expect(self.response.text).to_be_null()
        expect(self.response.error).not_to_be_null()
Ejemplo n.º 14
0
def tornado_requests(repetitions, concurrency, urls_to_retrieve, ignore_pycurl=False):
    """Retrieve `urls_to_retrieve` concurrently with TornadoOctopus.

    Returns the elapsed wall-clock time (seconds) spent waiting for all
    requests to finish.  NOTE(review): `repetitions` is accepted but unused
    here — kept for interface compatibility with the caller; confirm intent.
    """
    # Conditional expression instead of the fragile `and/or` ternary idiom.
    message = "Retrieving URLs concurrently with TornadoOctopus (%s)..." % (
        "using SimpleHTTPClient" if ignore_pycurl else "using pycurl"
    )
    # FIX: the bare Python 2 `print` statements were no-op expressions under
    # Python 3 (which the surrounding print() calls imply) — they printed nothing.
    print()
    print("=" * len(message))
    print(message)
    print("=" * len(message))
    print()

    otto = TornadoOctopus(concurrency=concurrency, cache=False, auto_start=True, ignore_pycurl=ignore_pycurl)

    for url in urls_to_retrieve:
        otto.enqueue(url, handle_url_response)

    # Only the wait (i.e. the actual fetching) is timed, not the enqueueing.
    start_time = time()
    otto.wait(0)

    return time() - start_time
def get_avatars(urls):
    """Fetch every avatar URL in `urls` concurrently.

    Returns the list of response bodies; URLs whose body is exactly
    'Not found' are reported on stdout and skipped.
    """
    avatars = []

    otto = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True, expiration_in_seconds=60
    )

    def handle_url_response(url, response):
        if 'Not found' == response.text:
            # FIX: was a Python 2 `print` statement — a SyntaxError on
            # Python 3.  The message text is unchanged.
            print('URL Not Found: %s' % url)
        else:
            avatars.append(response.text)

    for url in urls:
        otto.enqueue(url, handle_url_response)

    # Block until every enqueued URL has been handled.
    otto.wait()

    return avatars
Ejemplo n.º 16
0
    def test_can_get_many_urls(self):
        """Several URLs fetched concurrently should all complete with 200."""
        targets = [
            'http://www.globo.com', 'http://www.twitter.com',
            'http://www.facebook.com'
        ]
        octo = TornadoOctopus(concurrency=4, auto_start=True)

        def collect(url, response):
            self.responses[url] = response

        for target in targets:
            octo.enqueue(target, collect)

        octo.wait(2)

        expect(self.responses).to_length(3)

        # Every target must be present and must have succeeded.
        for target in targets:
            expect(self.responses).to_include(target)
            expect(self.responses[target].status_code).to_equal(200)
Ejemplo n.º 17
0
    def test_can_get_many_urls(self):
        """Concurrent retrieval of three URLs records one 200 response each."""
        wanted = [
            'http://www.globo.com',
            'http://www.twitter.com',
            'http://www.facebook.com'
        ]
        crawler = TornadoOctopus(concurrency=4, auto_start=True)

        def stash(url, response):
            self.responses[url] = response

        for address in wanted:
            crawler.enqueue(address, stash)

        crawler.wait(2)

        expect(self.responses).to_length(3)

        for address in wanted:
            expect(self.responses).to_include(address)
            expect(self.responses[address].status_code).to_equal(200)
Ejemplo n.º 18
0
def _download_url_list(image_url_list):
    '''Downloads the image sources of images listed on `image_url_list`.

    Returns the list of response bodies; URLs answered with the literal
    body 'Not found' are printed and skipped.
    '''

    images = []

    otto = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True, expiration_in_seconds=60
    )

    def handle_url_response(url, response):
        if 'Not found' == response.text:
            # FIX: was the Python 2 statement `print url`, a SyntaxError on
            # Python 3; the printed value is unchanged.
            print(url)
        else:
            images.append(response.text)

    for url in image_url_list:
        otto.enqueue(url, handle_url_response)

    otto.wait(0)

    return images
    def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self):
        """With per-domain limits of 1, all URLs finish and both domains are tracked."""
        limiter = PerDomainInMemoryLimiter(
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1}
        )
        crawler = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

        for target in (
            'http://globoesporte.globo.com',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com',
            'http://g1.globo.com/economia',
        ):
            crawler.enqueue(target, self.handle_url_response)

        crawler.wait(2)

        expect(self.responses).to_length(4)
        expect(list(limiter.domain_count.keys())).to_be_like(
            ['http://g1.globo.com', 'http://globoesporte.globo.com'])
    def test_should_not_get_more_than_one_url_for_same_domain_concurrently(
            self):
        """Domain-limited fetching still completes every enqueued URL."""
        limiter = PerDomainInMemoryLimiter(
            {'http://g1.globo.com': 1}, {'http://globoesporte.globo.com': 1})
        octo = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

        urls = [
            'http://globoesporte.globo.com',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com',
            'http://g1.globo.com/economia',
        ]
        for address in urls:
            octo.enqueue(address, self.handle_url_response)

        octo.wait(2)

        expect(self.responses).to_length(4)
        # Both limited domains must appear in the limiter's bookkeeping.
        expect(list(limiter.domain_count.keys())).to_be_like(
            ['http://g1.globo.com', 'http://globoesporte.globo.com'])
    def test_should_call_limiter_miss_twice(self):
        """Two URLs per limited domain means exactly two lock misses."""
        limiter = PerDomainRedisLimiter(
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1},
            redis=self.redis)
        limiter.subscribe_to_lock_miss(self.handle_limiter_miss)
        octo = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

        for target in (
            'http://globoesporte.globo.com/',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com/',
            'http://g1.globo.com/economia/',
        ):
            octo.enqueue(target, self.handle_url_response)

        octo.wait()

        expect(self.cache_miss).to_length(2)
    def test_should_call_limiter_miss_twice(self):
        """The redis-backed limiter reports one lock miss per over-limit URL."""
        rate_limiter = PerDomainRedisLimiter(
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1},
            redis=self.redis
        )
        rate_limiter.subscribe_to_lock_miss(self.handle_limiter_miss)
        crawler = TornadoOctopus(concurrency=10, auto_start=True, limiter=rate_limiter)

        crawler.enqueue('http://globoesporte.globo.com/', self.handle_url_response)
        crawler.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response)
        crawler.enqueue('http://g1.globo.com/', self.handle_url_response)
        crawler.enqueue('http://g1.globo.com/economia/', self.handle_url_response)

        crawler.wait()

        # One extra URL per domain -> two misses total.
        expect(self.cache_miss).to_length(2)
Ejemplo n.º 23
0
class BaseWorker(BaseCLI):
    """CLI worker that fetches URLs through TornadoOctopus.

    Wires together validator/facter class loading, an optional redis-backed
    per-domain rate limiter, a cached async HTTP getter, error reporting and
    event publishing.  NOTE(review): relies on BaseCLI/subclasses providing
    self.config, self.cache, self.redis, self.options, self.db,
    self.error_handlers, self.uuid and self.redis_pub_sub — confirm.
    """

    def _load_validators(self):
        # Validator classes come from the config-declared list.
        return load_classes(default=self.config.VALIDATORS)

    def _load_facters(self):
        # Facter classes come from the config-declared list.
        return load_classes(default=self.config.FACTERS)

    def get_otto_limiter(self):
        """Build a per-domain Limiter from cached domain definitions.

        Returns None when no domain limits are configured.
        """
        domains = self.cache.get_domain_limiters()
        limiter = None

        if domains:
            limiter = Limiter(
                *domains,
                redis=self.redis,
                expiration_in_seconds=self.config.LIMITER_LOCKS_EXPIRATION
            )

            # Get notified whenever a domain lock cannot be acquired.
            limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

        return limiter

    def update_otto_limiter(self):
        """Refresh the running limiter with the latest domain definitions."""
        domains = self.cache.get_domain_limiters()

        # NOTE(review): no guard for self.otto.limiter being None here
        # (compare the revisions of this class elsewhere in this file, which
        # add `and self.otto.limiter is not None`) — would raise
        # AttributeError if otto was built without a limiter.  Confirm.
        if hasattr(self.otto, 'limiter'):
            self.otto.limiter.update_domain_definitions(*domains)

    def start_otto(self):
        """Create and start the TornadoOctopus client used for fetching."""
        self.info('Starting Octopus with %d concurrent threads.' % self.options.concurrency)
        self.otto = TornadoOctopus(
            concurrency=self.options.concurrency, cache=self.options.cache,
            connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
            request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
            limiter=self.get_otto_limiter()
        )
        self.otto.start()

    def handle_error(self, exc_type, exc_value, tb):
        """Forward an exception (as an exc_info triple) to every error handler."""
        for handler in self.error_handlers:
            handler.handle_exception(
                exc_type, exc_value, tb, extra={
                    'worker-uuid': self.uuid,
                    'holmes-version': __version__
                }
            )

    def async_get(self, url, handler, method='GET', **kw):
        """Fetch `url` asynchronously, serving from the request cache when possible."""
        url, response = self.cache.get_request(url)

        if not response:
            # Cache miss: route through the configured proxy and enqueue.
            kw['proxy_host'] = self.config.HTTP_PROXY_HOST
            kw['proxy_port'] = self.config.HTTP_PROXY_PORT

            self.debug('Enqueueing %s for %s...' % (method, url))
            self.otto.enqueue(url, self.handle_response(url, handler), method, **kw)
        else:
            # Cache hit: invoke the handler synchronously.
            handler(url, response)

    def handle_response(self, url, handler):
        """Wrap `handler` so every response is stored in the request cache first."""
        def handle(url, response):
            self.cache.set_request(
                url, response.status_code, response.headers, response.cookies,
                response.text, response.effective_url, response.error, response.request_time,
                self.config.REQUEST_CACHE_EXPIRATION_IN_SECONDS
            )
            handler(url, response)
        return handle

    def handle_limiter_miss(self, url):
        # Hook for subclasses; the default ignores limiter lock misses.
        pass

    def publish(self, data):
        """Publish `data` on the shared 'events' pub/sub channel."""
        self.redis_pub_sub.publish('events', data)

    def _insert_keys(self, keys):
        """Ensure a Key row exists for every name in `keys`, storing the model back.

        Mutates `keys` in place: keys[name]['key'] receives the ORM object.
        """
        from holmes.models import Key

        for name in keys.keys():
            # One subtransaction per key — NOTE(review): presumably to commit
            # each insert independently; confirm against session semantics.
            self.db.begin(subtransactions=True)
            key = Key.get_or_create(self.db, name)
            keys[name]['key'] = key
            self.db.add(key)
            self.db.commit()
Ejemplo n.º 24
0
class BaseWorker(BaseCLI):
    """CLI worker that fetches URLs through TornadoOctopus.

    Revision with DB-session recovery in handle_error and a None-guard on
    the limiter.  NOTE(review): relies on BaseCLI/subclasses providing
    self.config, self.cache, self.redis, self.options, self.db,
    self.sqlalchemy_db_maker, self.error_handlers, self.uuid and
    self.redis_pub_sub — confirm.
    """

    def _load_validators(self):
        # Validator classes come from the config-declared list.
        return load_classes(default=self.config.VALIDATORS)

    def _load_facters(self):
        # Facter classes come from the config-declared list.
        return load_classes(default=self.config.FACTERS)

    def get_otto_limiter(self):
        """Build a per-domain Limiter from cached domain definitions, or None."""
        domains = self.cache.get_domain_limiters()
        limiter = None

        if domains:
            limiter = Limiter(
                *domains,
                redis=self.redis,
                expiration_in_seconds=self.config.LIMITER_LOCKS_EXPIRATION
            )

            # Get notified whenever a domain lock cannot be acquired.
            limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

        return limiter

    def update_otto_limiter(self):
        """Refresh the running limiter with the latest domain definitions."""
        domains = self.cache.get_domain_limiters()

        # Guarded: otto may have been built without a limiter (limiter=None).
        if hasattr(self.otto, 'limiter') and self.otto.limiter is not None:
            self.otto.limiter.update_domain_definitions(*domains)

    def start_otto(self):
        """Create and start the TornadoOctopus client used for fetching."""
        self.info('Starting Octopus with %d concurrent threads.' % self.options.concurrency)
        self.otto = TornadoOctopus(
            concurrency=self.options.concurrency, cache=self.options.cache,
            connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
            request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
            limiter=self.get_otto_limiter()
        )
        self.otto.start()

    def handle_error(self, exc_type, exc_value, tb):
        """Recover the worker after an exception, then report it.

        Order matters: roll back the DB session (if still valid), drop any
        pending URLs, rebuild the scoped session, and only then notify the
        error handlers.
        """
        try:
            # Rolling back an invalidated connection would raise again.
            if not self.db.connection().invalidated:
                self.db.rollback()
        except Exception:
            err = sys.exc_info()[1]
            logging.error("Cannot rollback: %s" % str(err))

        # Discard pending work and rebuild a fresh DB session.
        self.otto.url_queue = []
        self.db.close_all()
        self.db.remove()
        self.db = scoped_session(self.sqlalchemy_db_maker)

        for handler in self.error_handlers:
            handler.handle_exception(
                exc_type, exc_value, tb, extra={
                    'worker-uuid': self.uuid,
                    'holmes-version': __version__
                }
            )

    def async_get(self, url, handler, method='GET', **kw):
        """Fetch `url` asynchronously, serving from the request cache when possible."""
        url, response = self.cache.get_request(url)

        # Always identify ourselves with the configured user agent.
        kw['user_agent'] = self.config.HOLMES_USER_AGENT

        if not response:
            # Cache miss: route through the configured proxy and enqueue.
            kw['proxy_host'] = self.config.HTTP_PROXY_HOST
            kw['proxy_port'] = self.config.HTTP_PROXY_PORT

            self.debug('Enqueueing %s for %s...' % (method, url))
            self.otto.enqueue(url, self.handle_response(url, handler), method, **kw)
        else:
            # Cache hit: invoke the handler synchronously.
            handler(url, response)

    def handle_response(self, url, handler):
        """Wrap `handler` so every response is stored in the request cache first."""
        def handle(url, response):
            self.cache.set_request(
                url, response.status_code, response.headers, response.cookies,
                response.text, response.effective_url, response.error, response.request_time,
                self.config.REQUEST_CACHE_EXPIRATION_IN_SECONDS
            )
            handler(url, response)
        return handle

    def handle_limiter_miss(self, url):
        # Hook for subclasses; the default ignores limiter lock misses.
        pass

    def publish(self, data):
        """Publish `data` on the shared 'events' pub/sub channel."""
        self.redis_pub_sub.publish('events', data)
Ejemplo n.º 25
0
class BaseWorker(BaseCLI):
    """CLI worker that fetches URLs through TornadoOctopus.

    Reformatted revision with DB-session recovery and a limiter None-guard.
    NOTE(review): relies on BaseCLI/subclasses providing self.config,
    self.cache, self.redis, self.options, self.db, self.sqlalchemy_db_maker,
    self.error_handlers, self.uuid and self.redis_pub_sub — confirm.
    """

    def _load_validators(self):
        # Validator classes come from the config-declared list.
        return load_classes(default=self.config.VALIDATORS)

    def _load_facters(self):
        # Facter classes come from the config-declared list.
        return load_classes(default=self.config.FACTERS)

    def get_otto_limiter(self):
        """Build a per-domain Limiter from cached domain definitions, or None."""
        domains = self.cache.get_domain_limiters()
        limiter = None

        if domains:
            limiter = Limiter(
                *domains,
                redis=self.redis,
                expiration_in_seconds=self.config.LIMITER_LOCKS_EXPIRATION)

            # Get notified whenever a domain lock cannot be acquired.
            limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

        return limiter

    def update_otto_limiter(self):
        """Refresh the running limiter with the latest domain definitions."""
        domains = self.cache.get_domain_limiters()

        # Guarded: otto may have been built without a limiter (limiter=None).
        if hasattr(self.otto, 'limiter') and self.otto.limiter is not None:
            self.otto.limiter.update_domain_definitions(*domains)

    def start_otto(self):
        """Create and start the TornadoOctopus client used for fetching."""
        self.info('Starting Octopus with %d concurrent threads.' %
                  self.options.concurrency)
        self.otto = TornadoOctopus(
            concurrency=self.options.concurrency,
            cache=self.options.cache,
            connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
            request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
            limiter=self.get_otto_limiter())
        self.otto.start()

    def handle_error(self, exc_type, exc_value, tb):
        """Recover the worker after an exception, then report it.

        Order matters: roll back the DB session (if still valid), drop any
        pending URLs, rebuild the scoped session, and only then notify the
        error handlers.
        """
        try:
            # Rolling back an invalidated connection would raise again.
            if not self.db.connection().invalidated:
                self.db.rollback()
        except Exception:
            err = sys.exc_info()[1]
            logging.error("Cannot rollback: %s" % str(err))

        # Discard pending work and rebuild a fresh DB session.
        self.otto.url_queue = []
        self.db.close_all()
        self.db.remove()
        self.db = scoped_session(self.sqlalchemy_db_maker)

        for handler in self.error_handlers:
            handler.handle_exception(exc_type,
                                     exc_value,
                                     tb,
                                     extra={
                                         'worker-uuid': self.uuid,
                                         'holmes-version': __version__
                                     })

    def async_get(self, url, handler, method='GET', **kw):
        """Fetch `url` asynchronously, serving from the request cache when possible."""
        url, response = self.cache.get_request(url)

        # Always identify ourselves with the configured user agent.
        kw['user_agent'] = self.config.HOLMES_USER_AGENT

        if not response:
            # Cache miss: route through the configured proxy and enqueue.
            kw['proxy_host'] = self.config.HTTP_PROXY_HOST
            kw['proxy_port'] = self.config.HTTP_PROXY_PORT

            self.debug('Enqueueing %s for %s...' % (method, url))
            self.otto.enqueue(url, self.handle_response(url, handler), method,
                              **kw)
        else:
            # Cache hit: invoke the handler synchronously.
            handler(url, response)

    def handle_response(self, url, handler):
        """Wrap `handler` so every response is stored in the request cache first."""
        def handle(url, response):
            self.cache.set_request(
                url, response.status_code, response.headers, response.cookies,
                response.text, response.effective_url, response.error,
                response.request_time,
                self.config.REQUEST_CACHE_EXPIRATION_IN_SECONDS)
            handler(url, response)

        return handle

    def handle_limiter_miss(self, url):
        # Hook for subclasses; the default ignores limiter lock misses.
        pass

    def publish(self, data):
        """Publish `data` on the shared 'events' pub/sub channel."""
        self.redis_pub_sub.publish('events', data)
Ejemplo n.º 26
0
    def test_can_enqueue_url(self):
        """Zero concurrency means enqueue stores the URL without fetching it."""
        idle_client = TornadoOctopus(cache=False, concurrency=0)

        idle_client.enqueue(
            'http://www.google.com', None, method='GET', something="else")

        expect(idle_client.url_queue).to_length(1)