Beispiel #1
0
class ScheduledCallTest(unittest.TestCase):
    '''Exercises ScheduledCall against a manually advanced Clock.

    `self.obj.func` is the scheduled callback; the tests inspect
    `self.obj.args`, `self.obj.kwargs` and `self.obj.num_calls` to see how
    (and whether) it has been invoked.
    '''

    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        self.obj = ModifiedObject()
        # default_args/default_kwargs are the arguments bound at construction
        self.sc = ScheduledCall(self.obj.func, *self.default_args,
                                clock=self.clock, **self.default_kwargs)

    def _check(self, args, kwargs):
        # Compare what `self.obj` holds with the expectation. Passing None
        # for either value asserts that the matching attribute is None.
        if args is not None:
            self.assertTupleEqual(self.obj.args, args)
        else:
            self.assertIsNone(self.obj.args)

        if kwargs is not None:
            self.assertEqual(self.obj.kwargs, kwargs)
        else:
            self.assertIsNone(self.obj.kwargs)

    def test_init(self):
        # construct ScheduledCall without passing `clock`; scheduling and
        # cancelling must still work
        call = ScheduledCall(self.obj.func, *self.default_args,
                             **self.default_kwargs)
        call.schedule()
        call.cancel()

    def test_get_time_and_is_scheduled(self):
        self.clock.advance(10)

        # nothing scheduled yet
        self.assertFalse(self.sc.is_scheduled())
        self.assertEqual(self.sc.get_time(), 0)

        # scheduled 5 seconds ahead of the current time (10)
        self.sc.schedule(5)
        self.assertTrue(self.sc.is_scheduled())
        self.assertEqual(self.sc.get_time(), 15)

        # once the delay has elapsed the call is no longer scheduled
        self.clock.advance(5)
        self.assertFalse(self.sc.is_scheduled())
        self.assertEqual(self.sc.get_time(), 0)

    def test_no_delay(self):
        self.sc.schedule()
        # not invoked before the clock is advanced, even without a delay
        self._check(None, None)
        self.clock.advance(0)
        self._check(self.default_args, self.default_kwargs)

    def test_default(self):
        self.assertTrue(self.sc.schedule(5))
        self._check(None, None)
        self.clock.advance(1)
        # a second schedule() while one is pending is rejected ...
        self.assertFalse(self.sc.schedule(1))
        self.clock.advance(2)
        # ... and does not change when the pending call fires
        self._check(None, None)
        self.clock.advance(3)
        self._check(self.default_args, self.default_kwargs)

    def test_cancel(self):
        self.sc.schedule(5)
        self.clock.advance(3)
        self.sc.cancel()
        # advance past the original due time: the cancelled call stays silent
        self.clock.advance(3)
        self._check(None, None)
        # after a cancel, scheduling is accepted again
        self.assertTrue(self.sc.schedule(1))
        self.clock.advance(1)
        self._check(self.default_args, self.default_kwargs)

    def test_overwrite(self):
        new_args = ('crawlmi',)
        new_kwargs = {'a': 50, 'd': 'e'}
        # arguments passed to schedule() are the ones the callback receives
        self.sc.schedule(5, *new_args, **new_kwargs)
        self.clock.advance(5)
        self._check(new_args, new_kwargs)

    def test_partial_overwrite(self):
        new_args = ('crawlmi',)
        # only positional arguments given: the callback gets no kwargs
        self.sc.schedule(5, *new_args)
        self.clock.advance(5)
        self._check(new_args, {})

    def test_nested_schedule(self):
        # the callback re-schedules the same ScheduledCall from inside itself
        def reschedule(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()
        self.sc.func = reschedule

        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)

    def test_nested_schedule_delay(self):
        first_args = ('a',)
        first_kwargs = {'a': 'b'}
        second_args = ('b',)
        second_kwargs = {'b': 'c'}

        # the callback re-schedules itself 4 seconds later with new arguments
        def reschedule(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule(4, *second_args, **second_kwargs)
        self.sc.func = reschedule

        self.sc.schedule(3, *first_args, **first_kwargs)
        self.clock.advance(3)
        # first call done, nested call pending
        self.assertIsNotNone(self.sc._call)
        self._check(first_args, first_kwargs)
        # 3 of the 4 seconds elapsed: still the first call's arguments
        self.clock.advance(3)
        self._check(first_args, first_kwargs)
        self.clock.advance(1)
        self._check(second_args, second_kwargs)
Beispiel #2
0
class Slot(object):
    '''Queue of requests belonging to one particular domain.

    Downloads are started in the order the queue hands them out, while
    honouring both DOWNLOAD_DELAY (`delay`, optionally randomized) and
    CONCURRENT_REQUESTS_PER_DOMAIN (`concurrency`).
    '''

    def __init__(self, download_handler, concurrency, delay, randomize_delay,
                 clock=None):
        # unittests pass their own clock; otherwise the reactor is used
        self.clock = clock or reactor
        self.download_handler = download_handler
        self.concurrency = concurrency
        self.delay = delay
        self.randomize_delay = randomize_delay
        # time at which the most recent download was started
        self.last_download_time = 0
        # requests that were enqueued and whose deferred has not fired yet
        self.in_progress = set()
        # requests currently being downloaded
        self.transferring = set()
        # queue of (request, deferred) pairs
        self.queue = MemoryQueue()
        # used to re-run `_process()` once the download delay has elapsed
        self.delayed_processing = ScheduledCall(self._process, clock=self.clock)

    def enqueue(self, request, dfd):
        '''Main entry point: queue `request` and start it if possible.

        `dfd` is stored in the queue next to the request. When it fires,
        with a result or with a failure, the request is removed from
        `in_progress` and the value is passed through unchanged.
        '''
        def forget_request(result):
            self.in_progress.remove(request)
            return result

        self.in_progress.add(request)
        dfd.addBoth(forget_request)
        self.queue.push((request, dfd))
        self._process()

    @property
    def free_slots(self):
        '''How many more downloads may start before `concurrency` is hit.'''
        busy = len(self.transferring)
        return self.concurrency - busy

    def is_idle(self):
        '''Return True when no request is in progress.'''
        return not self.in_progress

    def _process(self):
        '''Start queued requests for as long as the delay and the
        concurrency limit allow it.
        '''
        # a delayed run is already pending; it will pick up the queue
        if self.delayed_processing.is_scheduled():
            return
        # still too early after the previous download
        if self._schedule_delay():
            return

        while self.queue and self.free_slots > 0:
            self.last_download_time = self.clock.seconds()
            request, downloaded_dfd = self.queue.pop()
            # forward the download outcome to the deferred given to enqueue()
            self._download(request).chainDeferred(downloaded_dfd)
            if self._schedule_delay():
                break

    def _schedule_delay(self):
        '''Schedule a delayed `_process()` if the download delay has not
        elapsed yet.

        Returns True when processing has to wait, False otherwise (always
        False when `delay` is falsy).
        '''
        if not self.delay:
            return False

        wait = (self.last_download_time + self.get_download_delay() -
                self.clock.seconds())
        if wait > 0:
            # the return value of schedule() is deliberately not checked
            self.delayed_processing.schedule(wait)
            return True
        return False

    def _download(self, request):
        '''Download `request` through the download handler.

        The request is kept in `transferring` until the download finishes,
        which blocks other requests from taking its slot. Returns the
        deferred of the download.
        '''
        def fresh_failure(failure):
            # It is VERY important to wrap the failure into a new object!
            # For errors like ConnectionLost the same Failure object is
            # returned every time, so its 'failure.request' field cannot
            # be used.
            return Failure(failure.value)

        def release_slot(result):
            self.transferring.remove(request)
            self._process()  # a slot got free: process unblocked requests
            return result

        dfd = defer.succeed(request)
        dfd.addCallback(self.download_handler.download_request)
        dfd.addErrback(fresh_failure)
        # mark as transferring before attaching the cleanup callback
        self.transferring.add(request)
        dfd.addBoth(release_slot)
        return dfd

    def get_download_delay(self):
        '''Return the delay to apply: `delay` itself, or a random value
        between 0.5 * delay and 1.5 * delay when `randomize_delay` is set.
        '''
        if not self.randomize_delay:
            return self.delay
        return random.uniform(0.5 * self.delay, 1.5 * self.delay)
class ScheduledCallTest(unittest.TestCase):
    '''ScheduledCall tests; time is controlled through `self.clock`.

    The scheduled callback is `self.obj.func`. What it was called with is
    read back from `self.obj.args` / `self.obj.kwargs`, and how often from
    `self.obj.num_calls`.
    '''

    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        self.obj = ModifiedObject()
        # bind the default arguments and the test clock at construction
        self.sc = ScheduledCall(
            self.obj.func, *self.default_args,
            clock=self.clock, **self.default_kwargs)

    def _check(self, args, kwargs):
        # Assert the arguments currently stored on `self.obj`; a None
        # expectation means the stored value itself has to be None.
        if args is not None:
            self.assertTupleEqual(self.obj.args, args)
        else:
            self.assertIsNone(self.obj.args)

        if kwargs is not None:
            self.assertEqual(self.obj.kwargs, kwargs)
        else:
            self.assertIsNone(self.obj.kwargs)

    def test_init(self):
        # a ScheduledCall built without an explicit clock can be scheduled
        # and cancelled
        plain_sc = ScheduledCall(self.obj.func, *self.default_args,
                                 **self.default_kwargs)
        plain_sc.schedule()
        plain_sc.cancel()

    def test_get_time_and_is_scheduled(self):
        self.clock.advance(10)

        # idle: not scheduled, get_time() is 0
        self.assertFalse(self.sc.is_scheduled())
        self.assertEqual(self.sc.get_time(), 0)

        # pending: get_time() is current time (10) plus the delay (5)
        self.sc.schedule(5)
        self.assertTrue(self.sc.is_scheduled())
        self.assertEqual(self.sc.get_time(), 15)

        # fired: back to the idle values
        self.clock.advance(5)
        self.assertFalse(self.sc.is_scheduled())
        self.assertEqual(self.sc.get_time(), 0)

    def test_no_delay(self):
        self.sc.schedule()
        # schedule() alone must not invoke the callback
        self._check(None, None)
        self.clock.advance(0)
        self._check(self.default_args, self.default_kwargs)

    def test_default(self):
        self.assertTrue(self.sc.schedule(5))
        self._check(None, None)
        self.clock.advance(1)
        # scheduling again while a call is pending returns False
        self.assertFalse(self.sc.schedule(1))
        self.clock.advance(2)
        # 3 seconds in: the rejected schedule(1) did not trigger anything
        self._check(None, None)
        self.clock.advance(3)
        self._check(self.default_args, self.default_kwargs)

    def test_cancel(self):
        self.sc.schedule(5)
        self.clock.advance(3)
        self.sc.cancel()
        # the cancelled call must not fire at its original due time
        self.clock.advance(3)
        self._check(None, None)
        # a fresh schedule is accepted after the cancel
        self.assertTrue(self.sc.schedule(1))
        self.clock.advance(1)
        self._check(self.default_args, self.default_kwargs)

    def test_overwrite(self):
        replacement_args = ('crawlmi', )
        replacement_kwargs = {'a': 50, 'd': 'e'}
        # schedule() arguments take the place of the defaults
        self.sc.schedule(5, *replacement_args, **replacement_kwargs)
        self.clock.advance(5)
        self._check(replacement_args, replacement_kwargs)

    def test_partial_overwrite(self):
        replacement_args = ('crawlmi', )
        # positional arguments only: the callback receives empty kwargs
        self.sc.schedule(5, *replacement_args)
        self.clock.advance(5)
        self._check(replacement_args, {})

    def test_nested_schedule(self):
        # callback that schedules the same ScheduledCall again
        def call_and_reschedule(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = call_and_reschedule
        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)

    def test_nested_schedule_delay(self):
        outer_args = ('a', )
        outer_kwargs = {'a': 'b'}
        inner_args = ('b', )
        inner_kwargs = {'b': 'c'}

        # callback that schedules a follow-up call 4 seconds later
        def call_and_reschedule(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule(4, *inner_args, **inner_kwargs)

        self.sc.func = call_and_reschedule
        self.sc.schedule(3, *outer_args, **outer_kwargs)
        self.clock.advance(3)
        # the outer call ran and left a pending inner call behind
        self.assertIsNotNone(self.sc._call)
        self._check(outer_args, outer_kwargs)
        # one second short of the inner call's due time
        self.clock.advance(3)
        self._check(outer_args, outer_kwargs)
        self.clock.advance(1)
        self._check(inner_args, inner_kwargs)
Beispiel #4
0
class Slot(object):
    '''Holds the requests of a single domain and decides when each one may
    be downloaded.

    Two limits are respected: DOWNLOAD_DELAY (`delay`, optionally
    randomized) and CONCURRENT_REQUESTS_PER_DOMAIN (`concurrency`).
    '''

    def __init__(self, download_handler, concurrency, delay, randomize_delay,
                 clock=None):
        self.download_handler = download_handler
        self.concurrency = concurrency
        self.delay = delay
        self.randomize_delay = randomize_delay

        # bookkeeping
        self.queue = MemoryQueue()  # (request, deferred) pairs
        self.in_progress = set()  # enqueued requests not yet finished
        self.transferring = set()  # requests being downloaded right now
        self.last_download_time = 0  # start time of the latest download

        # a custom clock is only passed by unittests
        self.clock = clock or reactor
        # postponed `_process()` run, used to wait out the download delay
        self.delayed_processing = ScheduledCall(self._process, clock=self.clock)

    def enqueue(self, request, dfd):
        '''Main entry point.

        Adds (request, dfd) to the queue and immediately tries to process
        the queue. The request counts as in progress until `dfd` fires.
        '''
        def on_finished(outcome):
            # runs for results and failures alike; passes the value through
            self.in_progress.remove(request)
            return outcome

        self.in_progress.add(request)
        dfd.addBoth(on_finished)
        self.queue.push((request, dfd))
        self._process()

    @property
    def free_slots(self):
        '''`concurrency` minus the number of transferring requests.'''
        active = len(self.transferring)
        return self.concurrency - active

    def is_idle(self):
        '''Tell whether the slot has no request in progress.'''
        return not self.in_progress

    def _process(self):
        '''Pop and download queued requests, stopping as soon as a delay has
        to be waited out or no free slot is left.
        '''
        waiting = self.delayed_processing.is_scheduled()
        # `_schedule_delay()` is only consulted when nothing is pending yet
        if waiting or self._schedule_delay():
            return

        while self.queue and self.free_slots > 0:
            self.last_download_time = self.clock.seconds()
            request, downloaded_dfd = self.queue.pop()
            download_dfd = self._download(request)
            # hand the download outcome over to the caller's deferred
            download_dfd.chainDeferred(downloaded_dfd)
            if self._schedule_delay():
                break

    def _schedule_delay(self):
        '''Postpone processing when the next download would come too early.

        Returns True if a delayed `_process()` was requested, False if
        processing may go on now. Without a `delay` this is always False.
        '''
        if not self.delay:
            return False

        remaining = (self.last_download_time + self.get_download_delay() -
                     self.clock.seconds())
        if remaining > 0:
            # schedule()'s own return value is ignored here
            self.delayed_processing.schedule(remaining)
            return True
        return False

    def _download(self, request):
        '''Run `request` through the download handler and return the
        resulting deferred.

        While the download is running the request sits in `transferring`;
        when it ends the request is removed and the queue is processed again.
        '''
        def rewrap(failure):
            # It is VERY important to wrap the failure into a new object!
            # Errors like ConnectionLost hand back the same Failure object
            # every time, which makes its 'failure.request' field unusable.
            return Failure(failure.value)

        def on_transferred(outcome):
            self.transferring.remove(request)
            self._process()  # process requests unblocked by the freed slot
            return outcome

        dfd = defer.succeed(request)
        # download the response
        dfd.addCallback(self.download_handler.download_request)
        dfd.addErrback(rewrap)
        # occupy a slot first, then register the callback that frees it
        self.transferring.add(request)
        dfd.addBoth(on_transferred)
        return dfd

    def get_download_delay(self):
        '''Delay between two downloads: `delay`, or a uniformly random value
        in [0.5 * delay, 1.5 * delay] if `randomize_delay` is true.
        '''
        if not self.randomize_delay:
            return self.delay
        return random.uniform(0.5 * self.delay, 1.5 * self.delay)