Beispiel #1
0
    def test_can_wait_when_no_urls(self, logging_mock):
        """wait() on an octopus with nothing enqueued should log and return.

        ``logging_mock`` is supplied from outside this method (presumably by a
        patch decorator on the logging function -- confirm against the class).
        """
        otto = TornadoOctopus(cache=False, auto_start=True)

        otto.wait()

        # ``assert_calls`` is not a Mock assertion: older mock versions create
        # it on the fly as a child mock (so nothing is checked) and newer ones
        # raise AttributeError for unknown ``assert*`` attributes.
        # ``assert_any_call`` actually verifies the message was logged.
        logging_mock.assert_any_call(
            'No urls to wait for. Returning immediately.')
Beispiel #2
0
def download():
    """Download the lorempixel fixture images for every entry in ``SIZES``.

    One request is enqueued per image number (1..``NUM_IMAGES``) and per
    ``(size, width, height)`` entry of ``SIZES``. A 200 response is saved as
    ``./tests/fixtures/imageset/<size>/image_<index>.jpg``; any other status
    code re-enqueues the same image.
    """
    otto = TornadoOctopus(
        concurrency=100, auto_start=True,
    )

    def enqueue(size, index, width, height):
        # Use the ``index`` argument, not the outer loop variable: retries are
        # issued from response callbacks, which may run after the loop below
        # has finished, when the loop variable no longer matches this image.
        url = 'http://lorempixel.com/%d/%d/' % (width, height)
        print("Enqueuing %s image %d..." % (size, index))
        otto.enqueue(url, handle_url_response(size, index, width, height))

    def handle_url_response(size, index, width, height):
        # Build a callback that remembers which image it belongs to.
        def handle(url, response):
            if response.status_code != 200:
                print("%s image %d (%d) failed." % (size, index, response.status_code))
                # Try the very same image again.
                enqueue(size, index, width, height)
                return

            print("%s image %d (%d) saved." % (size, index, response.status_code))

            path = './tests/fixtures/imageset/%s' % size
            if not exists(path):
                os.makedirs(path)
            jpg = StringIO(response.text)
            img = Image.open(jpg)
            img.save('%s/image_%d.jpg' % (path, index))

        return handle

    for image_number in range(1, NUM_IMAGES + 1):
        for size, width, height in SIZES:
            enqueue(size, image_number, width, height)

    otto.wait()  # waits until queue is empty or timeout is elapsed
    def test_can_wait_when_urls_and_timeout(self):
        """Call wait() with its default timeout while running_urls is non-zero."""
        octopus = TornadoOctopus(cache=False, auto_start=True)
        # Replace the loop with a Mock so calls made on it can be inspected.
        octopus.ioloop = Mock()
        octopus.running_urls = 10

        octopus.wait()

        # NOTE(review): no matcher is chained onto expect(), so this line
        # presumably asserts nothing -- confirm and add one if that is intended.
        expect(octopus.ioloop.set_blocking_signal_threshold.called)
    def test_can_wait_when_urls_and_no_timeout(self, logging_mock):
        """wait(0) with running_urls set should log that it is waiting.

        ``logging_mock`` is supplied from outside this method (presumably by a
        patch decorator on the logging function -- confirm against the class).
        """
        otto = TornadoOctopus(cache=False, auto_start=True)
        # Mock loop: nothing is really started by wait().
        otto.ioloop = Mock()
        otto.running_urls = 10

        otto.wait(0)

        # ``assert_calls`` is not a Mock assertion (a silent no-op on old mock
        # versions, AttributeError on newer ones); ``assert_any_call`` really
        # checks that the message was logged.
        logging_mock.assert_any_call('Waiting for urls to be retrieved.')
Beispiel #5
0
    def test_can_wait_when_urls_and_no_timeout(self, logging_mock):
        """wait(0) with running_urls set should log that it is waiting.

        ``logging_mock`` comes from outside this method (presumably a patch
        decorator on the logging function -- confirm against the class).
        """
        otto = TornadoOctopus(cache=False, auto_start=True)
        # Mock loop: nothing is really started by wait().
        otto.ioloop = Mock()
        otto.running_urls = 10

        otto.wait(0)

        # ``assert_calls`` does not exist on Mock, so it either checks nothing
        # (old mock versions) or raises AttributeError (newer ones). Use a real
        # assertion helper instead.
        logging_mock.assert_any_call('Waiting for urls to be retrieved.')
Beispiel #6
0
    def test_can_wait_when_urls_and_timeout(self):
        """Exercise wait() (default timeout) with a mocked loop and running URLs."""
        crawler = TornadoOctopus(cache=False, auto_start=True)
        crawler.ioloop = Mock()  # stand-in loop whose calls are recorded
        crawler.running_urls = 10

        crawler.wait()

        # NOTE(review): expect() is used without a matcher here, so presumably
        # nothing is verified -- confirm whether a matcher was intended.
        expect(crawler.ioloop.set_blocking_signal_threshold.called)
Beispiel #7
0
    def test_can_enqueue_and_get_when_cache_miss(self):
        """Enqueue a GET on a cache-enabled octopus and check the callback ran."""
        octopus = TornadoOctopus(cache=True, auto_start=True)

        def store_result(url, response):
            # Keep what the callback received so the checks below can read it.
            self.url = url
            self.response = response

        octopus.enqueue('http://www.google.com', store_result, method='GET')
        octopus.wait(2)

        expect(octopus.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
    def test_can_enqueue_and_get_when_cache_miss(self):
        """With cache=True, a GET request still reaches the response callback."""
        crawler = TornadoOctopus(cache=True, auto_start=True)

        def remember(url, response):
            # Record the callback arguments on the test case.
            self.url = url
            self.response = response

        crawler.enqueue('http://www.google.com', remember, method='GET')
        crawler.wait(2)

        # The queue must be drained and a response must have been delivered.
        expect(crawler.url_queue).to_be_empty()
        expect(self.response).not_to_be_null()
    def test_can_handle_exception(self, log_mock):
        """A callback that raises should lead to exactly one expected log call."""
        target = 'http://www.globo.com'
        octopus = TornadoOctopus(concurrency=4, auto_start=True)

        def failing_callback(url, response):
            # Always blow up so the error path is exercised.
            raise RuntimeError(url)

        octopus.enqueue(target, failing_callback)
        octopus.wait(2)

        log_mock.assert_called_once_with(
            'Error calling callback for http://www.globo.com.')
    def test_can_handle_timeouts(self):
        """A request with a 0.1s request timeout should report status 599."""
        target = 'http://baidu.com'
        octopus = TornadoOctopus(
            concurrency=1,
            request_timeout_in_seconds=0.1,
            auto_start=True,
        )

        def keep_response(url, response):
            self.response = response

        octopus.enqueue(target, keep_response)
        octopus.wait(5)

        # Expected result: 599 status, no body, and an error attached.
        expect(self.response.status_code).to_equal(599)
        expect(self.response.text).to_be_null()
        expect(self.response.error).not_to_be_null()
Beispiel #11
0
    def test_can_handle_exception(self, log_mock):
        """Raising inside the response callback must be logged exactly once."""
        crawler = TornadoOctopus(concurrency=4, auto_start=True)
        address = 'http://www.globo.com'

        def explode(url, response):
            # The callback's only job is to raise.
            raise RuntimeError(url)

        crawler.enqueue(address, explode)
        crawler.wait(2)

        expected_message = 'Error calling callback for http://www.globo.com.'
        log_mock.assert_called_once_with(expected_message)
Beispiel #12
0
    def test_can_handle_invalid_urls(self):
        """An invalid URL should still yield a response, carrying status 599."""
        target = 'http://kagdjdkjgka.fk'
        octopus = TornadoOctopus(concurrency=1, auto_start=True)

        def keep_response(url, response):
            self.response = response

        octopus.enqueue(target, keep_response)
        octopus.wait(5)

        # A response object is delivered: 599 status, no body, error set.
        expect(self.response).not_to_be_null()
        expect(self.response.status_code).to_equal(599)
        expect(self.response.text).to_be_null()
        expect(self.response.error).not_to_be_null()
    def test_can_handle_invalid_urls(self):
        """Fetching an invalid URL reports status 599 with an error and no text."""
        crawler = TornadoOctopus(concurrency=1, auto_start=True)
        address = 'http://kagdjdkjgka.fk'

        def capture(url, response):
            # Store the response for the expectations below.
            self.response = response

        crawler.enqueue(address, capture)
        crawler.wait(5)

        expect(self.response).not_to_be_null()
        expect(self.response.status_code).to_equal(599)
        expect(self.response.text).to_be_null()
        expect(self.response.error).not_to_be_null()
    def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self):
        """Fetch two URLs per limited domain and check all four responses arrive."""
        limiter = PerDomainInMemoryLimiter(
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1},
        )
        octopus = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

        # Two URLs for each of the two limited domains.
        for address in (
            'http://globoesporte.globo.com',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com',
            'http://g1.globo.com/economia',
        ):
            octopus.enqueue(address, self.handle_url_response)

        octopus.wait(2)

        expect(self.responses).to_length(4)

        seen_domains = list(limiter.domain_count.keys())
        expect(seen_domains).to_be_like(['http://g1.globo.com', 'http://globoesporte.globo.com'])
Beispiel #15
0
    def test_can_handle_timeouts(self):
        """With request_timeout_in_seconds=0.1 the response should be a 599."""
        crawler = TornadoOctopus(
            concurrency=1, request_timeout_in_seconds=0.1, auto_start=True)
        address = 'http://baidu.com'

        def capture(url, response):
            # Store the response for the expectations below.
            self.response = response

        crawler.enqueue(address, capture)
        crawler.wait(5)

        expect(self.response.status_code).to_equal(599)
        expect(self.response.text).to_be_null()
        expect(self.response.error).not_to_be_null()
    def test_should_call_limiter_miss_twice(self):
        """Expect two lock-miss notifications when fetching two URLs per domain."""
        limiter = PerDomainRedisLimiter(
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1},
            redis=self.redis,
        )
        limiter.subscribe_to_lock_miss(self.handle_limiter_miss)
        octopus = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

        # Two URLs for each of the two limited domains.
        for address in (
            'http://globoesporte.globo.com/',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com/',
            'http://g1.globo.com/economia/',
        ):
            octopus.enqueue(address, self.handle_url_response)

        octopus.wait()

        expect(self.cache_miss).to_length(2)
    def test_should_call_limiter_miss_twice(self):
        """Two URLs per limited domain should record two entries in cache_miss."""
        domain_limits = (
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1},
        )
        limiter = PerDomainRedisLimiter(*domain_limits, redis=self.redis)
        limiter.subscribe_to_lock_miss(self.handle_limiter_miss)
        crawler = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

        addresses = [
            'http://globoesporte.globo.com/',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com/',
            'http://g1.globo.com/economia/',
        ]
        for address in addresses:
            crawler.enqueue(address, self.handle_url_response)

        crawler.wait()

        expect(self.cache_miss).to_length(2)
    def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self):
        """All four URLs are fetched and the limiter tracks both domains."""
        domain_limits = (
            {'http://g1.globo.com': 1},
            {'http://globoesporte.globo.com': 1},
        )
        limiter = PerDomainInMemoryLimiter(*domain_limits)
        crawler = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

        addresses = [
            'http://globoesporte.globo.com',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com',
            'http://g1.globo.com/economia',
        ]
        for address in addresses:
            crawler.enqueue(address, self.handle_url_response)

        crawler.wait(2)

        expect(self.responses).to_length(4)

        tracked = list(limiter.domain_count.keys())
        expect(tracked).to_be_like(
            ['http://g1.globo.com', 'http://globoesporte.globo.com'])
Beispiel #19
0
def tornado_requests(repetitions, concurrency, urls_to_retrieve, ignore_pycurl=False):
    """Time how long TornadoOctopus takes to retrieve ``urls_to_retrieve``.

    Args:
        repetitions: Not used by this function.
        concurrency: Passed to TornadoOctopus as ``concurrency``.
        urls_to_retrieve: Iterable of URLs to enqueue.
        ignore_pycurl: Passed to TornadoOctopus as ``ignore_pycurl``; also
            selects the client name shown in the banner.

    Returns:
        The difference between ``time()`` taken right before and right after
        ``otto.wait(0)``.
    """
    client = "using SimpleHTTPClient" if ignore_pycurl else "using pycurl"
    message = "Retrieving URLs concurrently with TornadoOctopus (%s)..." % client
    rule = "=" * len(message)

    # print("") emits a blank line whether ``print`` is the Python 2 statement
    # or the print function; a bare ``print`` is a no-op with the function.
    print("")
    print(rule)
    print(message)
    print(rule)
    print("")

    otto = TornadoOctopus(concurrency=concurrency, cache=False, auto_start=True, ignore_pycurl=ignore_pycurl)

    for url in urls_to_retrieve:
        otto.enqueue(url, handle_url_response)

    # Only the wait() call is timed; enqueueing happens before the clock starts.
    start_time = time()
    otto.wait(0)

    return time() - start_time
def get_avatars(urls):
    """Fetch every URL in ``urls`` and return the collected response texts.

    Args:
        urls: Iterable of URLs to enqueue.

    Returns:
        A list with ``response.text`` of every response whose text is not
        exactly ``'Not found'``, in the order the callbacks were invoked.
    """
    avatars = []

    otto = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True, expiration_in_seconds=60
    )

    def handle_url_response(url, response):
        if response.text == 'Not found':
            # Single-argument print(...) behaves the same as the Python 2
            # print statement and is also valid on Python 3.
            print('URL Not Found: %s' % url)
        else:
            avatars.append(response.text)

    for url in urls:
        otto.enqueue(url, handle_url_response)

    otto.wait()

    return avatars
Beispiel #21
0
    def test_can_get_many_urls(self):
        """Fetch three sites and expect a 200 response recorded for each."""
        addresses = [
            'http://www.globo.com',
            'http://www.twitter.com',
            'http://www.facebook.com',
        ]
        octopus = TornadoOctopus(concurrency=4, auto_start=True)

        def collect(url, response):
            # Index responses by the URL they belong to.
            self.responses[url] = response

        for address in addresses:
            octopus.enqueue(address, collect)

        octopus.wait(2)

        expect(self.responses).to_length(3)

        for address in addresses:
            expect(self.responses).to_include(address)
            expect(self.responses[address].status_code).to_equal(200)
    def test_can_get_many_urls(self):
        """Three enqueued URLs should each end up in self.responses with a 200."""
        crawler = TornadoOctopus(concurrency=4, auto_start=True)
        targets = [
            'http://www.globo.com', 'http://www.twitter.com',
            'http://www.facebook.com'
        ]

        def record(url, response):
            # Store each response under its URL.
            self.responses[url] = response

        for target in targets:
            crawler.enqueue(target, record)

        crawler.wait(2)

        expect(self.responses).to_length(3)

        for target in targets:
            expect(self.responses).to_include(target)
            fetched = self.responses[target]
            expect(fetched.status_code).to_equal(200)
def _download_url_list(image_url_list):
    '''Downloads the image sources of images listed on `image_url_list`.

    Each URL is enqueued on a TornadoOctopus; `otto.wait(0)` is then called
    before the results are returned.

    Returns a list with `response.text` of every response whose text is not
    exactly 'Not found', in the order the callbacks were invoked. URLs whose
    response text is 'Not found' are printed instead.
    '''

    images = []

    otto = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True, expiration_in_seconds=60
    )

    def handle_url_response(url, response):
        if response.text == 'Not found':
            # Single-argument print(...) behaves the same as the Python 2
            # print statement and is also valid on Python 3.
            print(url)
        else:
            images.append(response.text)

    for url in image_url_list:
        otto.enqueue(url, handle_url_response)

    otto.wait(0)

    return images
    def test_can_wait_when_no_urls(self, logging_mock):
        """wait() with nothing enqueued should log that it returns immediately.

        ``logging_mock`` comes from outside this method (presumably a patch
        decorator on the logging function -- confirm against the class).
        """
        otto = TornadoOctopus(cache=False, auto_start=True)

        otto.wait()

        # ``assert_calls`` does not exist on Mock, so it either checks nothing
        # (old mock versions) or raises AttributeError (newer ones).
        # ``assert_any_call`` really verifies the message was logged.
        logging_mock.assert_any_call('No urls to wait for. Returning immediately.')