Ejemplo n.º 1
0
 def setUp(self):
     '''Create a `Clock`, fresh request/response queues and a `Downloader`
     that uses them together with a `MockDownloaderHandler`.
     '''
     self.clock = Clock()
     self.request_queue = MemoryQueue()
     self.response_queue = ResponseQueue()

     downloader_settings = Settings(self.default_settings)
     mock_handler = MockDownloaderHandler(Settings())
     self.dwn = Downloader(
         downloader_settings, self.request_queue, self.response_queue,
         download_handler=mock_handler, clock=self.clock)
     # shortcut to the handler exposed by the downloader
     self.handler = self.dwn.download_handler
Ejemplo n.º 2
0
 def __init__(self, download_handler, concurrency, delay, randomize_delay,
              clock=None):
     '''Store the configuration and create the empty bookkeeping state.

     `clock` is used in unittests; when it is falsy, `reactor` is used
     instead.
     '''
     self.clock = clock or reactor
     self.download_handler = download_handler
     self.concurrency = concurrency
     self.delay = delay
     self.randomize_delay = randomize_delay
     self.last_download_time = 0
     # requests waiting to be downloaded
     self.in_progress = set()
     # requests being downloaded (subset of `in_progress`)
     self.transferring = set()
     # queue of (request, deferred)
     self.queue = MemoryQueue()
     self.delayed_processing = ScheduledCall(self._process,
                                             clock=self.clock)
Ejemplo n.º 3
0
 def setUp(self):
     '''Create a `Clock`, fresh request/response queues and a `Downloader`
     that uses them together with a `MockDownloaderHandler`.
     '''
     self.clock = Clock()
     self.request_queue = MemoryQueue()
     self.response_queue = ResponseQueue()

     downloader_settings = Settings(self.default_settings)
     mock_handler = MockDownloaderHandler(Settings())
     self.dwn = Downloader(
         downloader_settings, self.request_queue, self.response_queue,
         download_handler=mock_handler, clock=self.clock)
     # shortcut to the handler exposed by the downloader
     self.handler = self.dwn.download_handler
Ejemplo n.º 4
0
 def __init__(self, download_handler, concurrency, delay, randomize_delay,
              clock=None):
     '''Store the configuration and create the empty bookkeeping state.

     `clock` is used in unittests; when it is falsy, `reactor` is used
     instead.
     '''
     self.clock = clock or reactor
     self.download_handler = download_handler
     self.concurrency = concurrency
     self.delay = delay
     self.randomize_delay = randomize_delay
     self.last_download_time = 0
     # requests waiting to be downloaded
     self.in_progress = set()
     # requests being downloaded (subset of `in_progress`)
     self.transferring = set()
     # queue of (request, deferred)
     self.queue = MemoryQueue()
     self.delayed_processing = ScheduledCall(self._process,
                                             clock=self.clock)
Ejemplo n.º 5
0
    def setup(self):
        '''Initialize the engine's components for the already assigned spider.

        Asserts that `self.spider` is set, starts logging when the
        `LOG_ENABLED` setting is true, and then creates, in this order: the
        signal manager, the stats object (class loaded from the `STATS_CLASS`
        setting), the request and response queues together with the
        downloader, the extension manager and the pipeline manager.
        Finally it sets `self.initialized` and passes the engine to the
        spider through `set_engine()`.
        '''
        assert self.spider is not None, 'Spider is not set in Engine.'

        # IMPORTANT: order of the following initializations is very important
        # so please, think twice about any changes to it

        # initialize logging
        if self.settings.get_bool('LOG_ENABLED'):
            log.start(self.settings['LOG_FILE'], self.settings['LOG_LEVEL'],
                      self.settings['LOG_STDOUT'],
                      self.settings['LOG_ENCODING'])

        # initialize signals
        self.signals = SignalManager(self)

        # initialize stats
        stats_cls = load_object(self.settings.get('STATS_CLASS'))
        self.stats = stats_cls(self)

        # initialize downloader
        # NOTE(review): the factory passed to PriorityQueue ignores its
        # argument and always builds a MemoryQueue; presumably it is called
        # once per priority level - confirm against PriorityQueue.
        self.request_queue = PriorityQueue(lambda _: MemoryQueue())
        self.response_queue = ResponseQueue(
            self.settings.get_int('RESPONSE_ACTIVE_SIZE_LIMIT'))
        self.downloader = Downloader(self.settings,
                                     self.request_queue,
                                     self.response_queue,
                                     clock=self.clock)

        # initialize extensions
        self.extensions = ExtensionManager(self)
        # initialize downloader pipeline
        self.pipeline = PipelineManager(self)

        self.initialized = True

        # now that everything is ready, set the spider's engine
        self.spider.set_engine(self)
Ejemplo n.º 6
0
class Slot(object):
    '''A queue of requests for one particular domain.

    Downloads started through the slot honour both DOWNLOAD_DELAY (`delay`)
    and CONCURRENT_REQUESTS_PER_DOMAIN (`concurrency`).
    '''

    def __init__(self, download_handler, concurrency, delay, randomize_delay,
                 clock=None):
        '''Store the configuration and create the empty bookkeeping state.

        `clock` is used in unittests; when it is falsy, `reactor` is used.
        '''
        self.clock = clock or reactor
        self.download_handler = download_handler
        self.concurrency = concurrency
        self.delay = delay
        self.randomize_delay = randomize_delay
        self.last_download_time = 0
        # requests enqueued whose deferred has not fired yet
        self.in_progress = set()
        # requests being downloaded (subset of `in_progress`)
        self.transferring = set()
        # queue of (request, deferred)
        self.queue = MemoryQueue()
        self.delayed_processing = ScheduledCall(self._process,
                                                clock=self.clock)

    def enqueue(self, request, dfd):
        '''Main entry point.

        Register `request` as in progress, queue it together with `dfd` and
        start downloading it if that is currently possible.
        '''
        def _forget_request(result):
            self.in_progress.remove(request)
            return result

        # the request must be registered before the callback is attached
        self.in_progress.add(request)
        dfd.addBoth(_forget_request)
        self.queue.push((request, dfd))
        self._process()

    @property
    def free_slots(self):
        '''Configured concurrency minus the number of running transfers.'''
        busy = len(self.transferring)
        return self.concurrency - busy

    def is_idle(self):
        '''Return True when no request is in progress.'''
        return not self.in_progress

    def _process(self):
        '''Start downloads for queued requests for as long as the delay and
        the concurrency limit allow it.
        '''
        if self.delayed_processing.is_scheduled():
            return
        if self._schedule_delay():
            return

        while self.queue and self.free_slots > 0:
            self.last_download_time = self.clock.seconds()
            request, result_dfd = self.queue.pop()
            self._download(request).chainDeferred(result_dfd)
            if self._schedule_delay():
                break

    def _schedule_delay(self):
        '''Schedule a delayed `_process()` call if the download delay has not
        elapsed yet. Return True when a delay is pending, False otherwise.
        '''
        if not self.delay:
            return False

        next_allowed = self.last_download_time + self.get_download_delay()
        penalty = next_allowed - self.clock.seconds()
        if penalty > 0:
            # this schedule should always be successful, because
            # `_schedule_delay()` is only called from within `_process()`
            self.delayed_processing.schedule(penalty)
            return True
        return False

    def _download(self, request):
        '''Pass `request` to the download handler and return the deferred
        carrying its result. The request is tracked in `transferring` until
        that deferred fires.
        '''
        # It is VERY important to wrap the failure into a new object!
        # For errors like ConnectionLost, the same Failure object is returned
        # every time and we cannot use the 'failure.request' field.
        def _rewrap(failure):
            return Failure(failure.value)

        # once the response is downloaded, remove the request from
        # `transferring` and process the requests it was blocking
        def _release(result):
            self.transferring.remove(request)
            self._process()
            return result

        dfd = defer.succeed(request)
        # download the response
        dfd.addCallback(self.download_handler.download_request)
        dfd.addErrback(_rewrap)
        # put the request into `transferring` to block other requests
        self.transferring.add(request)
        dfd.addBoth(_release)
        return dfd

    def get_download_delay(self):
        '''Return `delay`, or a random value between 0.5x and 1.5x of it when
        `randomize_delay` is set.
        '''
        if not self.randomize_delay:
            return self.delay
        low, high = 0.5 * self.delay, 1.5 * self.delay
        return random.uniform(low, high)
Ejemplo n.º 7
0
class DownloaderTest(unittest.TestCase):
    '''Tests of `Downloader`, driven by a manually advanced `Clock` and a
    `MockDownloaderHandler`.
    '''

    default_settings = {
        'CONCURRENT_REQUESTS': 2,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 0,
        'RANDOMIZE_DOWNLOAD_DELAY': False}

    def setUp(self):
        '''Create the clock, the queues and a downloader with the default
        settings.
        '''
        self.clock = Clock()
        self.request_queue = MemoryQueue()
        self.response_queue = ResponseQueue()
        self.dwn = Downloader(Settings(self.default_settings),
                              self.request_queue,
                              self.response_queue,
                              download_handler=MockDownloaderHandler(Settings()),
                              clock=self.clock)
        self.handler = self.dwn.download_handler

    def _update_dwn(self, **kwargs):
        '''Update downloader with the new settings.

        `kwargs` override entries of `default_settings`. The processing call
        of the previous downloader is cancelled before it is replaced.
        '''
        new_settings = self.default_settings.copy()
        new_settings.update(**kwargs)
        self.dwn.processing.cancel()
        self.dwn = Downloader(Settings(new_settings),
                              self.request_queue,
                              self.response_queue,
                              download_handler=MockDownloaderHandler(Settings()),
                              clock=self.clock)
        self.handler = self.dwn.download_handler

    def test_concurrency(self):
        '''Concurrency attributes are derived from the settings.'''
        # standard situation
        self._update_dwn()
        self.assertEqual(self.dwn.total_concurrency, 2)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertTrue(self.dwn.use_domain_specific)
        # delay set
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5,
                         DOWNLOAD_DELAY=5)
        self.assertEqual(self.dwn.total_concurrency, 1)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency is 0
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=0)
        self.assertEqual(self.dwn.total_concurrency, 10)
        self.assertEqual(self.dwn.domain_concurrency, 10)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency is too big
        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=10)
        self.assertEqual(self.dwn.total_concurrency, 5)
        self.assertEqual(self.dwn.domain_concurrency, 5)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency equals the total concurrency
        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5)
        self.assertFalse(self.dwn.use_domain_specific)

    def test_get_slot(self):
        '''Slots are keyed by hostname, or share the '' key when domain
        specific slots are disabled.
        '''
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, 'www.github.com')
        key2, slot2 = self.dwn._get_slot(
            Request('http://www.github.com/hello/world#bla'))
        self.assertEqual(key2, 'www.github.com')
        self.assertIs(slot2, slot)
        key3, slot3 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertEqual(key3, 'sites.github.com')
        self.assertIsNot(slot3, slot)
        self.assertEqual(len(self.dwn.slots), 2)

        # don't use domain specific slots
        self.dwn.use_domain_specific = False
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, '')
        key2, slot2 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertIs(slot2, slot)

    def test_basic(self):
        '''Five requests are downloaded while respecting the total and the
        per-domain concurrency.
        '''
        # create 5 requests with slot ids: a, b, a, a, c
        requests = [get_request(slot_id)[0] for slot_id in 'abaac']
        # explicit loop: `map()` is lazy on Python 3 and would push nothing
        for request in requests:
            self.request_queue.push(request)
        self.assertEqual(self.dwn.free_slots, 2)
        self.assertTrue(self.dwn.is_idle())

        # start downloading first two requests
        self.clock.advance(0)
        self.assertEqual(self.dwn.free_slots, 0)
        self.assertFalse(self.dwn.is_idle())
        # no more requests are scheduled, until download is finished
        self.clock.advance(20)
        self.assertEqual(len(self.request_queue), 3)
        # download the first request
        self.handler.call(requests[0], Response('hello'))
        # slot is immediately available
        self.assertEqual(self.dwn.free_slots, 1)
        # result is also available
        result = self.response_queue.peek()
        self.assertIs(result.request, requests[0])
        self.assertEqual(result.url, 'hello')
        # enqueue third request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # download second request
        self.handler.call(requests[1], Response(''))
        # enqueue fourth request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # fourth request should not begin download, until 3rd request is done
        self.assertRaises(KeyError, self.handler.call, requests[3],
                          Response(''))
        # finish
        self.handler.call(requests[2], Response(''))
        self.handler.call(requests[3], Response(''))
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.handler.call(requests[4], Response(''))
        # final checks
        self.clock.pump([1] * 10)
        self.assertEqual(len(self.response_queue), 5)
        self.assertTrue(self.dwn.is_idle())

    def test_close(self):
        '''After `close()` the downloader takes no more requests from the
        queue and puts nothing into the response queue.
        '''
        req1 = get_request('a')[0]
        req2 = get_request('b')[0]
        self.request_queue.push(req1)
        self.clock.advance(20)
        self.request_queue.push(req2)
        # test basic attributes, before and after closing
        self.assertTrue(self.dwn.running)
        self.assertTrue(self.dwn.processing.is_scheduled())
        self.dwn.close()
        self.assertFalse(self.dwn.running)
        self.assertFalse(self.dwn.processing.is_scheduled())

        self.clock.advance(20)
        # request 2 remains unqueued
        self.assertEqual(len(self.request_queue), 1)

        # downloader behavior after closing
        self.assertEqual(len(self.response_queue), 0)
        self.handler.call(req1, Response(''))
        self.assertEqual(len(self.response_queue), 0)

    def test_fail(self):
        '''Failed downloads reach the response queue carrying the original
        request and the original exception.
        '''
        self._update_dwn(CONCURRENT_REQUESTS=3,
                         CONCURRENT_REQUESTS_PER_DOMAIN=2)
        requests = [get_request(slot_id)[0] for slot_id in 'aab']
        # explicit loop: `map()` is lazy on Python 3 and would push nothing
        for request in requests:
            self.request_queue.push(request)

        # enqueue requests
        self.clock.advance(0)
        # fail 1st request
        err = ValueError('my bad')
        self.handler.fail(requests[0], err)
        self.assertEqual(self.dwn.free_slots, 1)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[0])
        self.assertIs(fail.value, err)
        # fail 3rd request
        self.handler.fail(requests[2], err)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[2])
        self.assertIs(fail.value, err)
        # succeed 2nd request
        self.handler.call(requests[1], Response('nice!', request=requests[1]))
        resp = self.response_queue.pop()
        self.assertIs(resp.request, requests[1])
        self.assertEqual(resp.url, 'nice!')

    def test_clear_slots(self):
        '''After 30 sequentially downloaded requests (ids 0..29) the number
        of kept slots does not exceed twice the total concurrency.
        '''
        requests = [get_request(slot_id)[0] for slot_id in range(30)]
        for request in requests:
            self.request_queue.push(request)
            self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
            self.handler.call(request, Response(''))
        self.assertLessEqual(len(self.dwn.slots),
                             2 * self.dwn.total_concurrency)
Ejemplo n.º 8
0
class DownloaderTest(unittest.TestCase):
    '''Tests of `Downloader`, driven by a manually advanced `Clock` and a
    `MockDownloaderHandler`.
    '''

    default_settings = {
        'CONCURRENT_REQUESTS': 2,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 0,
        'RANDOMIZE_DOWNLOAD_DELAY': False
    }

    def setUp(self):
        '''Create the clock, the queues and a downloader with the default
        settings.
        '''
        self.clock = Clock()
        self.request_queue = MemoryQueue()
        self.response_queue = ResponseQueue()
        self.dwn = Downloader(Settings(self.default_settings),
                              self.request_queue,
                              self.response_queue,
                              download_handler=MockDownloaderHandler(
                                  Settings()),
                              clock=self.clock)
        self.handler = self.dwn.download_handler

    def _update_dwn(self, **kwargs):
        '''Update downloader with the new settings.

        `kwargs` override entries of `default_settings`. The processing call
        of the previous downloader is cancelled before it is replaced.
        '''
        new_settings = self.default_settings.copy()
        new_settings.update(**kwargs)
        self.dwn.processing.cancel()
        self.dwn = Downloader(Settings(new_settings),
                              self.request_queue,
                              self.response_queue,
                              download_handler=MockDownloaderHandler(
                                  Settings()),
                              clock=self.clock)
        self.handler = self.dwn.download_handler

    def test_concurrency(self):
        '''Concurrency attributes and the download delay are derived from
        the settings.
        '''
        # standard situation
        self._update_dwn()
        self.assertEqual(self.dwn.total_concurrency, 2)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertTrue(self.dwn.use_domain_specific)
        # delay set
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5,
                         DOWNLOAD_DELAY=3.14)
        self.assertEqual(self.dwn.download_delay, 3.14)
        self.assertEqual(self.dwn.total_concurrency, 1)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency is 0
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=0)
        self.assertEqual(self.dwn.total_concurrency, 10)
        self.assertEqual(self.dwn.domain_concurrency, 10)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency is too big
        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=10)
        self.assertEqual(self.dwn.total_concurrency, 5)
        self.assertEqual(self.dwn.domain_concurrency, 5)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency equals the total concurrency
        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5)
        self.assertFalse(self.dwn.use_domain_specific)

    def test_get_slot(self):
        '''Slots are keyed by hostname, or share the '' key when domain
        specific slots are disabled.
        '''
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, 'www.github.com')
        key2, slot2 = self.dwn._get_slot(
            Request('http://www.github.com/hello/world#bla'))
        self.assertEqual(key2, 'www.github.com')
        self.assertIs(slot2, slot)
        key3, slot3 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertEqual(key3, 'sites.github.com')
        self.assertIsNot(slot3, slot)
        self.assertEqual(len(self.dwn.slots), 2)

        # don't use domain specific slots
        self.dwn.use_domain_specific = False
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, '')
        key2, slot2 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertIs(slot2, slot)

    def test_basic(self):
        '''Five requests are downloaded while respecting the total and the
        per-domain concurrency.
        '''
        # create 5 requests with slot ids: a, b, a, a, c
        requests = [get_request(slot_id)[0] for slot_id in 'abaac']
        # explicit loop: `map()` is lazy on Python 3 and would push nothing
        for request in requests:
            self.request_queue.push(request)
        self.assertEqual(self.dwn.free_slots, 2)
        self.assertTrue(self.dwn.is_idle())

        # start downloading first two requests
        self.clock.advance(0)
        self.assertEqual(self.dwn.free_slots, 0)
        self.assertFalse(self.dwn.is_idle())
        # no more requests are scheduled, until download is finished
        self.clock.advance(20)
        self.assertEqual(len(self.request_queue), 3)
        # download the first request
        self.handler.call(requests[0], Response('hello'))
        # slot is immediately available
        self.assertEqual(self.dwn.free_slots, 1)
        # result is also available
        result = self.response_queue.peek()
        self.assertIs(result.request, requests[0])
        self.assertEqual(result.url, 'hello')
        # enqueue third request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # download second request
        self.handler.call(requests[1], Response(''))
        # enqueue fourth request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # fourth request should not begin download, until 3rd request is done
        self.assertRaises(KeyError, self.handler.call, requests[3],
                          Response(''))
        # finish
        self.handler.call(requests[2], Response(''))
        self.handler.call(requests[3], Response(''))
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.handler.call(requests[4], Response(''))
        # final checks
        self.clock.pump([1] * 10)
        self.assertEqual(len(self.response_queue), 5)
        self.assertTrue(self.dwn.is_idle())

    def test_close(self):
        '''After `close()` the downloader takes no more requests from the
        queue and puts nothing into the response queue.
        '''
        req1 = get_request('a')[0]
        req2 = get_request('b')[0]
        self.request_queue.push(req1)
        self.clock.advance(20)
        self.request_queue.push(req2)
        # test basic attributes, before and after closing
        self.assertTrue(self.dwn.running)
        self.assertTrue(self.dwn.processing.is_scheduled())
        self.dwn.close()
        self.assertFalse(self.dwn.running)
        self.assertFalse(self.dwn.processing.is_scheduled())

        self.clock.advance(20)
        # request 2 remains unqueued
        self.assertEqual(len(self.request_queue), 1)

        # downloader behavior after closing
        self.assertEqual(len(self.response_queue), 0)
        self.handler.call(req1, Response(''))
        self.assertEqual(len(self.response_queue), 0)

    def test_fail(self):
        '''Failed downloads reach the response queue carrying the original
        request and the original exception.
        '''
        self._update_dwn(CONCURRENT_REQUESTS=3,
                         CONCURRENT_REQUESTS_PER_DOMAIN=2)
        requests = [get_request(slot_id)[0] for slot_id in 'aab']
        # explicit loop: `map()` is lazy on Python 3 and would push nothing
        for request in requests:
            self.request_queue.push(request)

        # enqueue requests
        self.clock.advance(0)
        # fail 1st request
        err = ValueError('my bad')
        self.handler.fail(requests[0], err)
        self.assertEqual(self.dwn.free_slots, 1)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[0])
        self.assertIs(fail.value, err)
        # fail 3rd request
        self.handler.fail(requests[2], err)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[2])
        self.assertIs(fail.value, err)
        # succeed 2nd request
        self.handler.call(requests[1], Response('nice!', request=requests[1]))
        resp = self.response_queue.pop()
        self.assertIs(resp.request, requests[1])
        self.assertEqual(resp.url, 'nice!')

    def test_clear_slots(self):
        '''After 30 sequentially downloaded requests (ids 0..29) the number
        of kept slots does not exceed twice the total concurrency.
        '''
        requests = [get_request(slot_id)[0] for slot_id in range(30)]
        for request in requests:
            self.request_queue.push(request)
            self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
            self.handler.call(request, Response(''))
        self.assertLessEqual(len(self.dwn.slots),
                             2 * self.dwn.total_concurrency)
Ejemplo n.º 9
0
def qfactory(priority):
    '''Queue factory: return a new `MemoryQueue`.

    `priority` is accepted but not used.
    '''
    queue = MemoryQueue()
    return queue
Ejemplo n.º 10
0
class Slot(object):
    '''A queue of requests for one particular domain.

    Downloads started through the slot honour both DOWNLOAD_DELAY (`delay`)
    and CONCURRENT_REQUESTS_PER_DOMAIN (`concurrency`).
    '''

    def __init__(self, download_handler, concurrency, delay, randomize_delay,
                 clock=None):
        '''Store the configuration and create the empty bookkeeping state.

        `clock` is used in unittests; when it is falsy, `reactor` is used.
        '''
        self.clock = clock or reactor
        self.download_handler = download_handler
        self.concurrency = concurrency
        self.delay = delay
        self.randomize_delay = randomize_delay
        self.last_download_time = 0
        # requests enqueued whose deferred has not fired yet
        self.in_progress = set()
        # requests being downloaded (subset of `in_progress`)
        self.transferring = set()
        # queue of (request, deferred)
        self.queue = MemoryQueue()
        self.delayed_processing = ScheduledCall(self._process,
                                                clock=self.clock)

    def enqueue(self, request, dfd):
        '''Main entry point.

        Register `request` as in progress, queue it together with `dfd` and
        start downloading it if that is currently possible.
        '''
        def _forget_request(result):
            self.in_progress.remove(request)
            return result

        # the request must be registered before the callback is attached
        self.in_progress.add(request)
        dfd.addBoth(_forget_request)
        self.queue.push((request, dfd))
        self._process()

    @property
    def free_slots(self):
        '''Configured concurrency minus the number of running transfers.'''
        busy = len(self.transferring)
        return self.concurrency - busy

    def is_idle(self):
        '''Return True when no request is in progress.'''
        return not self.in_progress

    def _process(self):
        '''Start downloads for queued requests for as long as the delay and
        the concurrency limit allow it.
        '''
        if self.delayed_processing.is_scheduled():
            return
        if self._schedule_delay():
            return

        while self.queue and self.free_slots > 0:
            self.last_download_time = self.clock.seconds()
            request, result_dfd = self.queue.pop()
            self._download(request).chainDeferred(result_dfd)
            if self._schedule_delay():
                break

    def _schedule_delay(self):
        '''Schedule a delayed `_process()` call if the download delay has not
        elapsed yet. Return True when a delay is pending, False otherwise.
        '''
        if not self.delay:
            return False

        next_allowed = self.last_download_time + self.get_download_delay()
        penalty = next_allowed - self.clock.seconds()
        if penalty > 0:
            # this schedule should always be successful, because
            # `_schedule_delay()` is only called from within `_process()`
            self.delayed_processing.schedule(penalty)
            return True
        return False

    def _download(self, request):
        '''Pass `request` to the download handler and return the deferred
        carrying its result. The request is tracked in `transferring` until
        that deferred fires.
        '''
        # It is VERY important to wrap the failure into a new object!
        # For errors like ConnectionLost, the same Failure object is returned
        # every time and we cannot use the 'failure.request' field.
        def _rewrap(failure):
            return Failure(failure.value)

        # once the response is downloaded, remove the request from
        # `transferring` and process the requests it was blocking
        def _release(result):
            self.transferring.remove(request)
            self._process()
            return result

        dfd = defer.succeed(request)
        # download the response
        dfd.addCallback(self.download_handler.download_request)
        dfd.addErrback(_rewrap)
        # put the request into `transferring` to block other requests
        self.transferring.add(request)
        dfd.addBoth(_release)
        return dfd

    def get_download_delay(self):
        '''Return `delay`, or a random value between 0.5x and 1.5x of it when
        `randomize_delay` is set.
        '''
        if not self.randomize_delay:
            return self.delay
        low, high = 0.5 * self.delay, 1.5 * self.delay
        return random.uniform(low, high)