def setUp(self):
     # Fixture: a test clock, an engine configured with
     # LOG_STATS_INTERVAL=30, the LogStats instance built on top of both and
     # a LogWrapper whose own setUp() is invoked here.
     self.clock = fake_clock = Clock()
     self.engine = engine = get_engine(LOG_STATS_INTERVAL=30)
     # give the engine a fresh signal manager before LogStats is constructed
     engine.signals = SignalManager(engine)
     self.ls = LogStats(engine, clock=fake_clock)
     self.lw = log_wrapper = LogWrapper()
     log_wrapper.setUp()
 def setUp(self):
     # Fixture: a ScheduledCall bound to a test clock; it targets
     # ModifiedObject.func and carries the class-level default arguments.
     self.clock = Clock()
     self.obj = ModifiedObject()
     target = self.obj.func
     self.sc = ScheduledCall(target, *self.default_args,
                             clock=self.clock, **self.default_kwargs)
class LogStatsTest(unittest.TestCase):
    '''Tests of LogStats, run against a manually advanced clock.
    '''

    def setUp(self):
        self.clock = fake_clock = Clock()
        self.engine = engine = get_engine(LOG_STATS_INTERVAL=30)
        # give the engine a fresh signal manager before LogStats is built
        engine.signals = SignalManager(engine)
        self.ls = LogStats(engine, clock=fake_clock)
        self.lw = log_wrapper = LogWrapper()
        log_wrapper.setUp()

    def tearDown(self):
        log_wrapper = self.lw
        log_wrapper.tearDown()

    def test_config(self):
        # an interval of 0 has to be rejected by the constructor
        engine = get_engine(LOG_STATS_INTERVAL=0)
        self.assertRaises(NotConfigured, LogStats, engine)

    def test_basic(self):
        clock = self.clock
        send = self.engine.signals.send
        first_line = self.lw.get_first_line
        no_pages = '[crawlmi] INFO: Crawled 0 pages (at 0 pages/min).'
        two_pages = '[crawlmi] INFO: Crawled 2 pages (at 4 pages/min).'

        # nothing is logged while the engine is stopped
        clock.advance(60)
        self.assertEqual(first_line(), '')

        # after the start, the first line shows up when 30 units elapsed
        send(signals.engine_started)
        clock.advance(29)
        self.assertEqual(first_line(), '')
        clock.advance(1)
        self.assertEqual(first_line(), no_pages)

        # two downloaded responses are counted, the received one is not
        for _ in range(2):
            send(signals.response_downloaded, response=Response(url=''))
        send(signals.response_received, response=Response(url=''))
        clock.advance(30)
        self.assertEqual(first_line(), two_pages)

        # no more logging once the engine is stopped
        send(signals.engine_stopped)
        clock.advance(60)
        self.assertEqual(first_line(), '')
Example #4
0
 def setUp(self):
     # Fixture: a Slot wired to a mock download handler and a test clock,
     # configured from the class-level default_* attributes.
     self.handler = MockDownloaderHandler(Settings())
     self.clock = Clock()
     slot_args = (self.handler,
                  self.default_concurrency,
                  self.default_delay,
                  self.default_randomize_delay)
     self.slot = Slot(*slot_args, clock=self.clock)
Example #5
0
 def setUp(self):
     # Fixture: a Downloader built from the class-level default settings,
     # fed by in-memory queues, using a mock download handler and a test
     # clock.
     self.clock = Clock()
     self.request_queue = MemoryQueue()
     self.response_queue = ResponseQueue()
     settings = Settings(self.default_settings)
     mock_handler = MockDownloaderHandler(Settings())
     self.dwn = downloader = Downloader(settings,
                                        self.request_queue,
                                        self.response_queue,
                                        download_handler=mock_handler,
                                        clock=self.clock)
     # expose the handler actually held by the downloader to the tests
     self.handler = downloader.download_handler
 def setUp(self):
     # Fixture: test clock -> engine (LOG_STATS_INTERVAL=30) -> LogStats,
     # plus a LogWrapper that is set up right away.
     test_clock = Clock()
     self.clock = test_clock
     stats_engine = get_engine(LOG_STATS_INTERVAL=30)
     self.engine = stats_engine
     # the engine gets a fresh signal manager before LogStats is created
     stats_engine.signals = SignalManager(stats_engine)
     self.ls = LogStats(stats_engine, clock=test_clock)
     wrapper = LogWrapper()
     self.lw = wrapper
     wrapper.setUp()
Example #7
0
 def setUp(self):
     # Fixture: in-memory queues, a test clock and a Downloader that uses a
     # mock download handler and the class-level default settings.
     self.clock = Clock()
     self.request_queue = MemoryQueue()
     self.response_queue = ResponseQueue()
     downloader_settings = Settings(self.default_settings)
     handler = MockDownloaderHandler(Settings())
     downloader = Downloader(downloader_settings, self.request_queue,
                             self.response_queue, download_handler=handler,
                             clock=self.clock)
     self.dwn = downloader
     # keep a shortcut to the handler held by the downloader
     self.handler = downloader.download_handler
class LogStatsTest(unittest.TestCase):
    '''Tests of LogStats, run against a manually advanced clock.
    '''

    def setUp(self):
        test_clock = Clock()
        self.clock = test_clock
        stats_engine = get_engine(LOG_STATS_INTERVAL=30)
        self.engine = stats_engine
        # the engine gets a fresh signal manager before LogStats is created
        stats_engine.signals = SignalManager(stats_engine)
        self.ls = LogStats(stats_engine, clock=test_clock)
        wrapper = LogWrapper()
        self.lw = wrapper
        wrapper.setUp()

    def tearDown(self):
        wrapper = self.lw
        wrapper.tearDown()

    def test_config(self):
        # LOG_STATS_INTERVAL=0 must make the constructor raise NotConfigured
        disabled_engine = get_engine(LOG_STATS_INTERVAL=0)
        self.assertRaises(NotConfigured, LogStats, disabled_engine)

    def test_basic(self):
        advance = self.clock.advance
        send_signal = self.engine.signals.send
        read_log = self.lw.get_first_line

        # engine is stopped: no output
        advance(60)
        self.assertEqual(read_log(), '')

        # engine is started: output appears after 29 + 1 time units
        send_signal(signals.engine_started)
        advance(29)
        self.assertEqual(read_log(), '')
        advance(1)
        expected = '[crawlmi] INFO: Crawled 0 pages (at 0 pages/min).'
        self.assertEqual(read_log(), expected)

        # two response_downloaded signals and one response_received signal
        # are sent; the expected line reports 2 pages
        send_signal(signals.response_downloaded, response=Response(url=''))
        send_signal(signals.response_downloaded, response=Response(url=''))
        send_signal(signals.response_received, response=Response(url=''))
        advance(30)
        expected = '[crawlmi] INFO: Crawled 2 pages (at 4 pages/min).'
        self.assertEqual(read_log(), expected)

        # engine is stopped again: no output
        send_signal(signals.engine_stopped)
        advance(60)
        self.assertEqual(read_log(), '')
Example #9
0
def get_engine(custom_settings=None, **kwargs):
    '''Return the engine initialized with the custom settings.

    `custom_settings` is an optional mapping of setting overrides; `kwargs`
    are additional overrides which win over `custom_settings` on conflict.
    The mapping passed by the caller is never modified.
    '''
    # merge into a fresh dict - updating `custom_settings` in place would
    # leak the keyword overrides into the caller's dictionary
    merged_settings = dict(custom_settings or {})
    merged_settings.update(kwargs)
    settings = EngineSettings(custom_settings=merged_settings)
    engine = Engine(settings, Project(path=None), clock=Clock())
    engine.set_spider(BaseSpider('dummy'))
    engine.stop_if_idle = False
    # it is common to use stats and signals in unittests, without full
    # initialization of the engine
    engine.stats = MemoryStats(engine)
    engine.signals = SignalManager(engine)
    return engine
Example #10
0
class LoopingCallTest(unittest.TestCase):
    '''Tests of LoopingCall, run against a manually advanced clock.
    '''
    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        self.obj = ModifiedObject()
        target = self.obj.func
        self.sc = LoopingCall(target, *self.default_args,
                              clock=self.clock, **self.default_kwargs)

    def _check(self, args, kwargs):
        # Compare what self.obj recorded with the expected call arguments;
        # an expected value of None asserts that the recorded one is None.
        recorded = self.obj
        if args is not None:
            self.assertTupleEqual(recorded.args, args)
        else:
            self.assertIsNone(recorded.args)

        if kwargs is not None:
            self.assertEqual(recorded.kwargs, kwargs)
        else:
            self.assertIsNone(recorded.kwargs)

    def test_init(self):
        # a LoopingCall created without an explicit clock can still be
        # scheduled and cancelled
        looping = LoopingCall(self.obj.func, *self.default_args,
                              **self.default_kwargs)
        looping.schedule()
        looping.cancel()

    def test_basic(self):
        sc, clock, obj = self.sc, self.clock, self.obj

        # nothing is scheduled until schedule() is called
        self.assertFalse(sc.is_scheduled())
        sc.schedule(2, count=2, now=False)

        # scheduled, but the first call is still pending
        self.assertTrue(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 2)
        self.assertEqual(sc.calls_left(), 2)
        clock.advance(1)
        self.assertEqual(obj.num_calls, 0)

        # the first call fires when the clock reaches 2
        clock.advance(1)
        self.assertEqual(obj.num_calls, 1)
        self.assertEqual(sc.calls_left(), 1)
        self._check(self.default_args, self.default_kwargs)

        # the second - and last - call fires at 4
        clock.advance(2)
        self.assertEqual(obj.num_calls, 2)
        self.assertEqual(sc.calls_left(), 0)
        self.assertFalse(sc.is_scheduled())

        # the count is used up, nothing else gets called
        clock.advance(20)
        self.assertEqual(obj.num_calls, 2)

    def test_now(self):
        sc, clock, obj = self.sc, self.clock, self.obj
        sc.schedule(2, count=2, now=True)
        # even with now=True nothing runs before the clock is advanced
        self.assertEqual(obj.num_calls, 0)
        for step, expected_calls in ((0, 1), (20, 2)):
            clock.advance(step)
            self.assertEqual(obj.num_calls, expected_calls)

    def test_infinite(self):
        sc = self.sc
        # without a count the call keeps repeating
        sc.schedule(2)
        steps = [2] * 100
        self.clock.pump(steps)
        self.assertEqual(self.obj.num_calls, 100)
        self.assertTrue(sc.is_scheduled())
        self.assertIsNone(sc.calls_left())

    def test_cancel(self):
        sc, clock = self.sc, self.clock
        sc.schedule(2)
        clock.advance(1)
        # cancelled before the first call could fire
        sc.cancel()
        clock.advance(20)
        self.assertEqual(self.obj.num_calls, 0)

    def test_reschedule(self):
        sc, clock, obj = self.sc, self.clock, self.obj
        sc.schedule(2)
        clock.advance(1)
        # scheduling again replaces the pending call: the first one is
        # expected 5 units after this point
        sc.schedule(5)
        for step, expected_calls in ((4, 0), (1, 1)):
            clock.advance(step)
            self.assertEqual(obj.num_calls, expected_calls)

    def test_no_delay(self):
        clock, obj = self.clock, self.obj
        self.sc.schedule()
        self.assertEqual(obj.num_calls, 0)
        # every advance(0) lets exactly one more call through
        for expected_calls in (1, 2):
            clock.advance(0)
            self.assertEqual(obj.num_calls, expected_calls)

    def test_nested_schedule(self):
        clock, obj = self.clock, self.obj

        # the looped function schedules the LoopingCall again from inside
        def func(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = func
        self.sc.schedule()
        self.assertEqual(obj.num_calls, 0)
        for expected_calls in (1, 2):
            clock.advance(0)
            self.assertEqual(obj.num_calls, expected_calls)
class ScheduledCallTest(unittest.TestCase):
    '''Tests of ScheduledCall, run against a manually advanced clock.
    '''
    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        self.obj = ModifiedObject()
        target = self.obj.func
        self.sc = ScheduledCall(target, *self.default_args,
                                clock=self.clock, **self.default_kwargs)

    def _check(self, args, kwargs):
        # Compare what self.obj recorded with the expected call arguments;
        # an expected value of None asserts that the recorded one is None.
        recorded = self.obj
        if args is not None:
            self.assertTupleEqual(recorded.args, args)
        else:
            self.assertIsNone(recorded.args)

        if kwargs is not None:
            self.assertEqual(recorded.kwargs, kwargs)
        else:
            self.assertIsNone(recorded.kwargs)

    def _check_defaults(self):
        # the call went through with the arguments given to the constructor
        self._check(self.default_args, self.default_kwargs)

    def test_init(self):
        # a ScheduledCall created without an explicit clock can still be
        # scheduled and cancelled
        call = ScheduledCall(self.obj.func, *self.default_args,
                             **self.default_kwargs)
        call.schedule()
        call.cancel()

    def test_get_time_and_is_scheduled(self):
        sc, clock = self.sc, self.clock
        clock.advance(10)

        # not scheduled: the reported time is 0
        self.assertFalse(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 0)
        # scheduled at time 10 with delay 5: the reported time is 15
        sc.schedule(5)
        self.assertTrue(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 15)
        # once the call fired it is not scheduled anymore
        clock.advance(5)
        self.assertFalse(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 0)

    def test_no_delay(self):
        self.sc.schedule()
        # nothing runs until the clock is advanced, even by 0
        self._check(None, None)
        self.clock.advance(0)
        self._check_defaults()

    def test_default(self):
        sc, clock = self.sc, self.clock
        first_attempt = sc.schedule(5)
        self.assertTrue(first_attempt)
        self._check(None, None)
        clock.advance(1)
        # scheduling while a call is pending is refused and does not move
        # the pending call
        second_attempt = sc.schedule(1)
        self.assertFalse(second_attempt)
        clock.advance(2)
        self._check(None, None)
        clock.advance(3)
        self._check_defaults()

    def test_cancel(self):
        sc, clock = self.sc, self.clock
        sc.schedule(5)
        clock.advance(3)
        sc.cancel()
        # the cancelled call never fires
        clock.advance(3)
        self._check(None, None)
        # and the object can be scheduled again afterwards
        rescheduled = sc.schedule(1)
        self.assertTrue(rescheduled)
        clock.advance(1)
        self._check_defaults()

    def test_overwrite(self):
        # arguments given to schedule() are used for the call
        new_args = ('crawlmi', )
        new_kwargs = {'a': 50, 'd': 'e'}
        self.sc.schedule(5, *new_args, **new_kwargs)
        self.clock.advance(5)
        self._check(new_args, new_kwargs)

    def test_partial_overwrite(self):
        # only positional arguments are given to schedule(); the call is
        # expected to receive no keyword arguments at all
        new_args = ('crawlmi', )
        self.sc.schedule(5, *new_args)
        self.clock.advance(5)
        self._check(new_args, {})

    def test_nested_schedule(self):
        clock, obj = self.clock, self.obj

        # the scheduled function schedules the ScheduledCall again
        def func(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = func
        self.sc.schedule()
        self.assertEqual(obj.num_calls, 0)
        for expected_calls in (1, 2):
            clock.advance(0)
            self.assertEqual(obj.num_calls, expected_calls)

    def test_nested_schedule_delay(self):
        sc, clock = self.sc, self.clock
        first_args, first_kwargs = ('a', ), {'a': 'b'}
        second_args, second_kwargs = ('b', ), {'b': 'c'}

        # the scheduled function schedules the next call, with different
        # arguments and a delay of 4
        def func(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule(4, *second_args, **second_kwargs)

        sc.func = func
        sc.schedule(3, *first_args, **first_kwargs)
        clock.advance(3)
        # the first call ran and left the nested one pending
        self.assertIsNotNone(sc._call)
        self._check(first_args, first_kwargs)
        # 3 units later the nested call is still waiting
        clock.advance(3)
        self._check(first_args, first_kwargs)
        # 1 more unit and it fires
        clock.advance(1)
        self._check(second_args, second_kwargs)
class LoopingCallTest(unittest.TestCase):
    '''Tests of LoopingCall, run against a manually advanced clock.
    '''
    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        self.obj = ModifiedObject()
        callee = self.obj.func
        self.sc = LoopingCall(callee, *self.default_args,
                              clock=self.clock, **self.default_kwargs)

    def _check(self, args, kwargs):
        # Verify the arguments recorded by self.obj. Passing None as the
        # expected value asserts that the recorded value is None.
        seen = self.obj
        if args is not None:
            self.assertTupleEqual(seen.args, args)
        else:
            self.assertIsNone(seen.args)

        if kwargs is not None:
            self.assertEqual(seen.kwargs, kwargs)
        else:
            self.assertIsNone(seen.kwargs)

    def test_init(self):
        # no clock is passed here: scheduling and cancelling must still work
        plain_call = LoopingCall(self.obj.func, *self.default_args,
                                 **self.default_kwargs)
        plain_call.schedule()
        plain_call.cancel()

    def test_basic(self):
        sc = self.sc
        advance = self.clock.advance
        obj = self.obj

        # idle before schedule()
        self.assertFalse(sc.is_scheduled())
        sc.schedule(2, count=2, now=False)

        # waiting for the first call
        self.assertTrue(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 2)
        self.assertEqual(sc.calls_left(), 2)
        advance(1)
        self.assertEqual(obj.num_calls, 0)

        # first call done, one more to go
        advance(1)
        self.assertEqual(obj.num_calls, 1)
        self.assertEqual(sc.calls_left(), 1)
        self._check(self.default_args, self.default_kwargs)

        # second call done, none left
        advance(2)
        self.assertEqual(obj.num_calls, 2)
        self.assertEqual(sc.calls_left(), 0)
        self.assertFalse(sc.is_scheduled())

        # and it stays that way
        advance(20)
        self.assertEqual(obj.num_calls, 2)

    def test_now(self):
        advance = self.clock.advance
        obj = self.obj
        self.sc.schedule(2, count=2, now=True)
        # the call is not made synchronously from schedule()
        self.assertEqual(obj.num_calls, 0)
        advance(0)
        self.assertEqual(obj.num_calls, 1)
        advance(20)
        self.assertEqual(obj.num_calls, 2)

    def test_infinite(self):
        sc = self.sc
        sc.schedule(2)
        # 100 steps of size 2 -> 100 calls, and the loop is still running
        self.clock.pump([2] * 100)
        self.assertEqual(self.obj.num_calls, 100)
        self.assertTrue(sc.is_scheduled())
        self.assertIsNone(sc.calls_left())

    def test_cancel(self):
        sc = self.sc
        advance = self.clock.advance
        sc.schedule(2)
        advance(1)
        sc.cancel()
        # the pending call was cancelled in time
        advance(20)
        self.assertEqual(self.obj.num_calls, 0)

    def test_reschedule(self):
        sc = self.sc
        advance = self.clock.advance
        obj = self.obj
        sc.schedule(2)
        advance(1)
        # the second schedule() overrides the first one
        sc.schedule(5)
        advance(4)
        self.assertEqual(obj.num_calls, 0)
        advance(1)
        self.assertEqual(obj.num_calls, 1)

    def test_no_delay(self):
        advance = self.clock.advance
        obj = self.obj
        self.sc.schedule()
        self.assertEqual(obj.num_calls, 0)
        # one call per advance(0)
        advance(0)
        self.assertEqual(obj.num_calls, 1)
        advance(0)
        self.assertEqual(obj.num_calls, 2)

    def test_nested_schedule(self):
        advance = self.clock.advance
        obj = self.obj

        # calling schedule() from within the looped function
        def func(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = func
        self.sc.schedule()
        self.assertEqual(obj.num_calls, 0)
        advance(0)
        self.assertEqual(obj.num_calls, 1)
        advance(0)
        self.assertEqual(obj.num_calls, 2)
Example #13
0
 def setUp(self):
     # Fixture: the Slot under test, driven by a mock download handler and
     # a test clock; its parameters come from the class-level defaults.
     mock_handler = MockDownloaderHandler(Settings())
     self.handler = mock_handler
     test_clock = Clock()
     self.clock = test_clock
     self.slot = Slot(mock_handler,
                      self.default_concurrency,
                      self.default_delay,
                      self.default_randomize_delay,
                      clock=test_clock)
Example #14
0
class DownloaderSlotTest(unittest.TestCase):
    '''Tests of the downloader Slot, using a mock download handler and a
    manually advanced clock.
    '''
    # arguments of the Slot created in setUp
    default_concurrency = 2
    default_delay = 0
    default_randomize_delay = False

    def setUp(self):
        # the handler is kept on the test case, the tests use it to finish
        # (handler.call) or fail (handler.fail) individual requests by hand
        self.handler = MockDownloaderHandler(Settings())
        self.clock = Clock()
        self.slot = Slot(self.handler, self.default_concurrency,
                         self.default_delay, self.default_randomize_delay,
                         clock=self.clock)

    # Three requests go into a slot of concurrency 2: two are transferred at
    # once, the third one waits until a transfer finishes.
    def test_basic(self):
        received = []
        def downloaded(result):
            received.append(result)

        # enqueue 3 requests
        # NOTE(review): get_request is defined elsewhere; it returns a
        # (request, deferred) pair and presumably hooks `func` to the deferred
        r1, dfd1 = get_request('1', func=downloaded)
        self.slot.enqueue(r1, dfd1)
        self.assertEqual(len(self.slot.in_progress), 1)
        self.assertEqual(len(self.slot.transferring), 1)
        r2, dfd2 = get_request('2', func=downloaded)
        r3, dfd3 = get_request('3', func=downloaded)
        self.slot.enqueue(r2, dfd2)
        self.slot.enqueue(r3, dfd3)
        self.assertEqual(len(self.slot.in_progress), 3)
        self.assertEqual(len(self.slot.transferring), 2)
        self.assertEqual(self.slot.free_slots, 0)

        # download r2
        # (the number of transfers is expected to stay at 2, i.e. the
        # waiting request takes the place of r2)
        self.handler.call(r2, Response(''))
        self.assertIs(received[-1].request, r2)
        self.assertEqual(len(self.slot.transferring), 2)
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(self.slot.free_slots, 0)

        # download r1 and r3
        self.handler.call(r3, Response(''))
        self.handler.call(r1, Response(''))
        self.assertIs(received[-2].request, r3)
        self.assertIs(received[-1].request, r1)
        self.assertEqual(self.slot.free_slots, 2)

        # nothing happens now
        self.clock.advance(5)
        self.assertEqual(len(self.slot.in_progress), 0)
        self.assertEqual(self.slot.free_slots, 2)

    # With concurrency 1 and delay 5, consecutive downloads are expected to
    # start at least 5 clock units apart.
    def test_delay(self):
        self.slot.concurrency = 1
        self.slot.delay = 5
        self.clock.advance(10)  # so we don't start on time 0

        # enqueue 3 requests
        r1, dfd1 = get_request('1')
        self.slot.enqueue(r1, dfd1)
        r2, dfd2 = get_request('2')
        self.slot.enqueue(r2, dfd2)
        r3, dfd3 = get_request('3')
        self.slot.enqueue(r3, dfd3)
        # only r1 is transferred (started at time 10), the next processing
        # is planned for 10 + 5
        self.assertEqual(len(self.slot.in_progress), 3)
        self.assertEqual(len(self.slot.transferring), 1)
        self.assertEqual(self.slot.last_download_time, 10)
        self.assertEqual(self.slot.delayed_processing.get_time(), 15)

        # download the 1st request
        self.handler.call(r1, Response(''))
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 0)
        self.assertEqual(self.slot.free_slots, 1)
        # we should still wait
        # (the clock is at 13 now, the processing is planned for 15)
        self.clock.advance(3)
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 0)
        self.assertEqual(self.slot.free_slots, 1)
        # make the 2nd request downloading
        # (the clock jumps to 16, which is recorded as the download time,
        # and the next processing is planned for 16 + 5)
        self.clock.advance(3)
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 1)
        self.assertEqual(self.slot.free_slots, 0)
        self.assertEqual(self.slot.last_download_time, 16)
        self.assertEqual(self.slot.delayed_processing.get_time(), 21)
        # wait and nothing happens
        # (the only transfer slot is still taken by the 2nd request)
        self.clock.advance(10)
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 1)
        self.assertFalse(self.slot.delayed_processing.is_scheduled())

    # get_download_delay() has to return the exact delay when randomization
    # is off, and values spread over [0.5 * delay, 1.5 * delay] when it is on.
    def test_random_delay(self):
        self.slot.delay = 5
        self.slot.randomize_delay = False
        delays = [self.slot.get_download_delay() for x in xrange(10)]
        self.assertTrue(all(x == 5 for x in delays))

        self.slot.randomize_delay = True
        lower = 0.5 * 5
        upper = 1.5 * 5
        delays = [self.slot.get_download_delay() for x in xrange(100)]
        self.assertTrue(all(lower <= x <= upper for x in delays))

        # split the interval into thirds, each of them has to be hit by at
        # least one of the 100 samples
        third1 = (2 * lower + upper) / 3
        third2 = (lower + 2 * upper) / 3
        self.assertTrue(any(x <= third1 for x in delays))
        self.assertTrue(any(third1 <= x <= third2 for x in delays))
        self.assertTrue(any(third2 <= x for x in delays))

    # A failed request is reported through its callback and does not affect
    # the other requests of the slot.
    def test_fail(self):
        received = []
        def downloaded(result):
            received.append(result)

        # enqueue 3 requests
        r1, dfd1 = get_request('1', func=downloaded)
        self.slot.enqueue(r1, dfd1)
        r2, dfd2 = get_request('2', func=downloaded)
        self.slot.enqueue(r2, dfd2)
        r3, dfd3 = get_request('3', func=downloaded)
        self.slot.enqueue(r3, dfd3)
        # fail the first request
        err = ValueError('my bad')
        self.handler.fail(r1, err)
        self.assertEqual(received[-1].value, err)
        # other requests should be ok
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 2)
        self.handler.call(r2, Response(''))
        self.assertEqual(received[-1].request, r2)
        self.handler.call(r3, Response(''))
        self.assertEqual(received[-1].request, r3)
        self.assertEqual(len(self.slot.in_progress), 0)
        self.assertEqual(len(self.slot.transferring), 0)

    # NOTE(review): ExceptionDownloaderHandler is defined elsewhere,
    # presumably it raises when asked to download - confirm. The deferred of
    # the request is expected to fail with an Exception.
    def test_exception(self):
        self.slot.download_handler = ExceptionDownloaderHandler(Settings())
        r1, dfd1 = get_request('1')
        self.slot.enqueue(r1, dfd1)
        return self.assertFailure(dfd1, Exception)

    # NOTE(review): FailureDownloaderHandler is defined elsewhere, presumably
    # it fails every download with a ValueError - confirm. Each request has
    # to receive its own Failure object.
    def test_failure(self):
        self.slot.download_handler = FailureDownloaderHandler(Settings())
        download_values = []
        def downloaded(value):
            download_values.append(value)

        for i in xrange(2):
            r, dfd = get_request(str(i))
            dfd.addBoth(downloaded)
            self.slot.enqueue(r, dfd)

        # both deferreds have already fired by now, with distinct failures
        self.assertEqual(len(download_values), 2)
        self.assertIsInstance(download_values[0], Failure)
        self.assertIsInstance(download_values[1], Failure)
        self.assertIsNot(download_values[0], download_values[1])
        self.assertIsInstance(download_values[0].value, ValueError)
Example #15
0
class ScheduledCallTest(unittest.TestCase):
    '''Tests of ScheduledCall, run against a manually advanced clock.
    '''
    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        self.obj = ModifiedObject()
        callee = self.obj.func
        self.sc = ScheduledCall(callee, *self.default_args,
                                clock=self.clock, **self.default_kwargs)

    def _check(self, args, kwargs):
        # Verify the arguments recorded by self.obj. Passing None as the
        # expected value asserts that the recorded value is None.
        seen = self.obj
        if args is not None:
            self.assertTupleEqual(seen.args, args)
        else:
            self.assertIsNone(seen.args)

        if kwargs is not None:
            self.assertEqual(seen.kwargs, kwargs)
        else:
            self.assertIsNone(seen.kwargs)

    def test_init(self):
        # no clock is passed here: scheduling and cancelling must still work
        plain_call = ScheduledCall(self.obj.func, *self.default_args,
                                   **self.default_kwargs)
        plain_call.schedule()
        plain_call.cancel()

    def test_get_time_and_is_scheduled(self):
        sc = self.sc
        advance = self.clock.advance
        advance(10)

        # idle: get_time() gives 0
        self.assertFalse(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 0)
        # pending: get_time() gives the time of the call (10 + 5)
        sc.schedule(5)
        self.assertTrue(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 15)
        # fired: idle again
        advance(5)
        self.assertFalse(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 0)

    def test_no_delay(self):
        defaults = (self.default_args, self.default_kwargs)
        self.sc.schedule()
        # schedule() itself does not make the call
        self._check(None, None)
        self.clock.advance(0)
        self._check(*defaults)

    def test_default(self):
        sc = self.sc
        advance = self.clock.advance
        self.assertTrue(sc.schedule(5))
        self._check(None, None)
        advance(1)
        # a call is already pending: schedule() reports False and the
        # original time of the call is kept
        self.assertFalse(sc.schedule(1))
        advance(2)
        self._check(None, None)
        advance(3)
        self._check(self.default_args, self.default_kwargs)

    def test_cancel(self):
        sc = self.sc
        advance = self.clock.advance
        sc.schedule(5)
        advance(3)
        sc.cancel()
        # the clock passes the original time of the call, nothing is called
        advance(3)
        self._check(None, None)
        # scheduling works again after cancel()
        self.assertTrue(sc.schedule(1))
        advance(1)
        self._check(self.default_args, self.default_kwargs)

    def test_overwrite(self):
        # both kinds of arguments are passed to schedule()
        call_args = ('crawlmi',)
        call_kwargs = {'a': 50, 'd': 'e'}
        self.sc.schedule(5, *call_args, **call_kwargs)
        self.clock.advance(5)
        self._check(call_args, call_kwargs)

    def test_partial_overwrite(self):
        # positional arguments only: empty kwargs are expected in the call
        call_args = ('crawlmi',)
        self.sc.schedule(5, *call_args)
        self.clock.advance(5)
        self._check(call_args, {})

    def test_nested_schedule(self):
        advance = self.clock.advance
        obj = self.obj

        # calling schedule() from within the scheduled function
        def func(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = func
        self.sc.schedule()
        self.assertEqual(obj.num_calls, 0)
        advance(0)
        self.assertEqual(obj.num_calls, 1)
        advance(0)
        self.assertEqual(obj.num_calls, 2)

    def test_nested_schedule_delay(self):
        sc = self.sc
        advance = self.clock.advance
        outer_args = ('a',)
        outer_kwargs = {'a': 'b'}
        nested_args = ('b',)
        nested_kwargs = {'b': 'c'}

        # the scheduled function plans the next call itself, 4 units later
        # and with its own arguments
        def func(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule(4, *nested_args, **nested_kwargs)

        sc.func = func
        sc.schedule(3, *outer_args, **outer_kwargs)
        advance(3)
        # outer call done, nested call pending
        self.assertIsNotNone(sc._call)
        self._check(outer_args, outer_kwargs)
        # nested call not due yet
        advance(3)
        self._check(outer_args, outer_kwargs)
        # nested call done
        advance(1)
        self._check(nested_args, nested_kwargs)
Example #16
0
class DownloaderSlotTest(unittest.TestCase):
    '''Tests of the downloader Slot, using a mock download handler and a
    manually advanced clock.
    '''
    # arguments of the Slot created in setUp
    default_concurrency = 2
    default_delay = 0
    default_randomize_delay = False

    def setUp(self):
        # the handler is kept on the test case, the tests use it to finish
        # (handler.call) or fail (handler.fail) individual requests by hand
        self.handler = MockDownloaderHandler(Settings())
        self.clock = Clock()
        self.slot = Slot(self.handler,
                         self.default_concurrency,
                         self.default_delay,
                         self.default_randomize_delay,
                         clock=self.clock)

    # Three requests go into a slot of concurrency 2: two are transferred at
    # once, the third one waits until a transfer finishes.
    def test_basic(self):
        received = []

        def downloaded(result):
            received.append(result)

        # enqueue 3 requests
        # NOTE(review): get_request is defined elsewhere; it returns a
        # (request, deferred) pair and presumably hooks `func` to the deferred
        r1, dfd1 = get_request('1', func=downloaded)
        self.slot.enqueue(r1, dfd1)
        self.assertEqual(len(self.slot.in_progress), 1)
        self.assertEqual(len(self.slot.transferring), 1)
        r2, dfd2 = get_request('2', func=downloaded)
        r3, dfd3 = get_request('3', func=downloaded)
        self.slot.enqueue(r2, dfd2)
        self.slot.enqueue(r3, dfd3)
        self.assertEqual(len(self.slot.in_progress), 3)
        self.assertEqual(len(self.slot.transferring), 2)
        self.assertEqual(self.slot.free_slots, 0)

        # download r2
        # (the number of transfers is expected to stay at 2, i.e. the
        # waiting request takes the place of r2)
        self.handler.call(r2, Response(''))
        self.assertIs(received[-1].request, r2)
        self.assertEqual(len(self.slot.transferring), 2)
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(self.slot.free_slots, 0)

        # download r1 and r3
        self.handler.call(r3, Response(''))
        self.handler.call(r1, Response(''))
        self.assertIs(received[-2].request, r3)
        self.assertIs(received[-1].request, r1)
        self.assertEqual(self.slot.free_slots, 2)

        # nothing happens now
        self.clock.advance(5)
        self.assertEqual(len(self.slot.in_progress), 0)
        self.assertEqual(self.slot.free_slots, 2)

    # With concurrency 1 and delay 5, consecutive downloads are expected to
    # start at least 5 clock units apart.
    def test_delay(self):
        self.slot.concurrency = 1
        self.slot.delay = 5
        self.clock.advance(10)  # so we don't start on time 0

        # enqueue 3 requests
        r1, dfd1 = get_request('1')
        self.slot.enqueue(r1, dfd1)
        r2, dfd2 = get_request('2')
        self.slot.enqueue(r2, dfd2)
        r3, dfd3 = get_request('3')
        self.slot.enqueue(r3, dfd3)
        # only r1 is transferred (started at time 10), the next processing
        # is planned for 10 + 5
        self.assertEqual(len(self.slot.in_progress), 3)
        self.assertEqual(len(self.slot.transferring), 1)
        self.assertEqual(self.slot.last_download_time, 10)
        self.assertEqual(self.slot.delayed_processing.get_time(), 15)

        # download the 1st request
        self.handler.call(r1, Response(''))
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 0)
        self.assertEqual(self.slot.free_slots, 1)
        # we should still wait
        # (the clock is at 13 now, the processing is planned for 15)
        self.clock.advance(3)
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 0)
        self.assertEqual(self.slot.free_slots, 1)
        # make the 2nd request downloading
        # (the clock jumps to 16, which is recorded as the download time,
        # and the next processing is planned for 16 + 5)
        self.clock.advance(3)
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 1)
        self.assertEqual(self.slot.free_slots, 0)
        self.assertEqual(self.slot.last_download_time, 16)
        self.assertEqual(self.slot.delayed_processing.get_time(), 21)
        # wait and nothing happens
        # (the only transfer slot is still taken by the 2nd request)
        self.clock.advance(10)
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 1)
        self.assertFalse(self.slot.delayed_processing.is_scheduled())

    # get_download_delay() has to return the exact delay when randomization
    # is off, and values spread over [0.5 * delay, 1.5 * delay] when it is on.
    def test_random_delay(self):
        self.slot.delay = 5
        self.slot.randomize_delay = False
        delays = [self.slot.get_download_delay() for x in xrange(10)]
        self.assertTrue(all(x == 5 for x in delays))

        self.slot.randomize_delay = True
        lower = 0.5 * 5
        upper = 1.5 * 5
        delays = [self.slot.get_download_delay() for x in xrange(100)]
        self.assertTrue(all(lower <= x <= upper for x in delays))

        # split the interval into thirds, each of them has to be hit by at
        # least one of the 100 samples
        third1 = (2 * lower + upper) / 3
        third2 = (lower + 2 * upper) / 3
        self.assertTrue(any(x <= third1 for x in delays))
        self.assertTrue(any(third1 <= x <= third2 for x in delays))
        self.assertTrue(any(third2 <= x for x in delays))

    # A failed request is reported through its callback and does not affect
    # the other requests of the slot.
    def test_fail(self):
        received = []

        def downloaded(result):
            received.append(result)

        # enqueue 3 requests
        r1, dfd1 = get_request('1', func=downloaded)
        self.slot.enqueue(r1, dfd1)
        r2, dfd2 = get_request('2', func=downloaded)
        self.slot.enqueue(r2, dfd2)
        r3, dfd3 = get_request('3', func=downloaded)
        self.slot.enqueue(r3, dfd3)
        # fail the first request
        err = ValueError('my bad')
        self.handler.fail(r1, err)
        self.assertEqual(received[-1].value, err)
        # other requests should be ok
        self.assertEqual(len(self.slot.in_progress), 2)
        self.assertEqual(len(self.slot.transferring), 2)
        self.handler.call(r2, Response(''))
        self.assertEqual(received[-1].request, r2)
        self.handler.call(r3, Response(''))
        self.assertEqual(received[-1].request, r3)
        self.assertEqual(len(self.slot.in_progress), 0)
        self.assertEqual(len(self.slot.transferring), 0)

    # NOTE(review): ExceptionDownloaderHandler is defined elsewhere,
    # presumably it raises when asked to download - confirm. The deferred of
    # the request is expected to fail with an Exception.
    def test_exception(self):
        self.slot.download_handler = ExceptionDownloaderHandler(Settings())
        r1, dfd1 = get_request('1')
        self.slot.enqueue(r1, dfd1)
        return self.assertFailure(dfd1, Exception)

    # NOTE(review): FailureDownloaderHandler is defined elsewhere, presumably
    # it fails every download with a ValueError - confirm. Each request has
    # to receive its own Failure object.
    def test_failure(self):
        self.slot.download_handler = FailureDownloaderHandler(Settings())
        download_values = []

        def downloaded(value):
            download_values.append(value)

        for i in xrange(2):
            r, dfd = get_request(str(i))
            dfd.addBoth(downloaded)
            self.slot.enqueue(r, dfd)

        # both deferreds have already fired by now, with distinct failures
        self.assertEqual(len(download_values), 2)
        self.assertIsInstance(download_values[0], Failure)
        self.assertIsInstance(download_values[1], Failure)
        self.assertIsNot(download_values[0], download_values[1])
        self.assertIsInstance(download_values[0].value, ValueError)
Example #17
0
class DownloaderTest(unittest.TestCase):
    '''Tests of Downloader driven by a manually advanced Clock and a
    MockDownloaderHandler, so that each download is completed (or failed)
    explicitly by the test itself.
    '''

    default_settings = {
        'CONCURRENT_REQUESTS': 2,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 0,
        'RANDOMIZE_DOWNLOAD_DELAY': False
    }

    def setUp(self):
        '''Create a fresh clock, fresh queues and a downloader configured
        with `default_settings`.
        '''
        self.clock = Clock()
        self.request_queue = MemoryQueue()
        self.response_queue = ResponseQueue()
        self._install_downloader(self.default_settings)

    def _install_downloader(self, settings):
        '''Build a Downloader from the `settings` dict on top of the shared
        queues and clock, and remember it together with its handler.
        '''
        self.dwn = Downloader(Settings(settings),
                              self.request_queue,
                              self.response_queue,
                              download_handler=MockDownloaderHandler(
                                  Settings()),
                              clock=self.clock)
        self.handler = self.dwn.download_handler

    def _update_dwn(self, **kwargs):
        '''Update downloader with the new settings.

        `kwargs` override individual entries of `default_settings`. The
        current downloader's `processing` call is cancelled before the
        replacement is created.
        '''
        new_settings = dict(self.default_settings, **kwargs)
        self.dwn.processing.cancel()
        self._install_downloader(new_settings)

    def test_concurrency(self):
        '''Check the concurrency values derived from the settings.'''
        # standard situation
        self._update_dwn()
        self.assertEqual(self.dwn.total_concurrency, 2)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertTrue(self.dwn.use_domain_specific)
        # delay set
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5,
                         DOWNLOAD_DELAY=3.14)
        self.assertEqual(self.dwn.download_delay, 3.14)
        self.assertEqual(self.dwn.total_concurrency, 1)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency is 0
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=0)
        self.assertEqual(self.dwn.total_concurrency, 10)
        self.assertEqual(self.dwn.domain_concurrency, 10)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency is too big
        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=10)
        self.assertEqual(self.dwn.total_concurrency, 5)
        self.assertEqual(self.dwn.domain_concurrency, 5)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency equal to the total one
        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5)
        self.assertFalse(self.dwn.use_domain_specific)

    def test_get_slot(self):
        '''Requests to the same host share a slot, other hosts get their
        own; with `use_domain_specific` off every request shares one slot.
        '''
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, 'www.github.com')
        key2, slot2 = self.dwn._get_slot(
            Request('http://www.github.com/hello/world#bla'))
        self.assertEqual(key2, 'www.github.com')
        self.assertIs(slot2, slot)
        key3, slot3 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertEqual(key3, 'sites.github.com')
        self.assertIsNot(slot3, slot)
        self.assertEqual(len(self.dwn.slots), 2)

        # don't use domain specific slots
        self.dwn.use_domain_specific = False
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, '')
        key2, slot2 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertIs(slot2, slot)

    def test_basic(self):
        '''Walk five requests through the downloader, checking the number
        of free slots and the response queue after every step.
        '''
        # create 5 requests with slot ids: a, b, a, a, c
        requests = [get_request(slot_id)[0] for slot_id in 'abaac']
        # explicit loop: `map` used only for its side effects is lazy on
        # Python 3 and would push nothing
        for request in requests:
            self.request_queue.push(request)
        self.assertEqual(self.dwn.free_slots, 2)
        self.assertTrue(self.dwn.is_idle())

        # start downloading first two requests
        self.clock.advance(0)
        self.assertEqual(self.dwn.free_slots, 0)
        self.assertFalse(self.dwn.is_idle())
        # no more requests are scheduled, until download is finished
        self.clock.advance(20)
        self.assertEqual(len(self.request_queue), 3)
        # download the first request
        self.handler.call(requests[0], Response('hello'))
        self.assertEqual(self.dwn.free_slots,
                         1)  # slot is immediately available
        # result is also available
        result = self.response_queue.peek()
        self.assertIs(result.request, requests[0])
        self.assertEqual(result.url, 'hello')
        # enqueue third request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # download second request
        self.handler.call(requests[1], Response(''))
        # enqueue fourth request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # fourth request should not begin download, until 3rd request is done
        self.assertRaises(KeyError, self.handler.call, requests[3],
                          Response(''))
        # finish
        self.handler.call(requests[2], Response(''))
        self.handler.call(requests[3], Response(''))
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.handler.call(requests[4], Response(''))
        # final checks
        self.clock.pump([1] * 10)
        self.assertEqual(len(self.response_queue), 5)
        self.assertTrue(self.dwn.is_idle())

    def test_close(self):
        '''After close() the downloader stops taking requests from the
        queue and no longer puts finished downloads on the response queue.
        '''
        req1 = get_request('a')[0]
        req2 = get_request('b')[0]
        self.request_queue.push(req1)
        self.clock.advance(20)
        self.request_queue.push(req2)
        # test basic attributes, before and after closing
        self.assertTrue(self.dwn.running)
        self.assertTrue(self.dwn.processing.is_scheduled())
        self.dwn.close()
        self.assertFalse(self.dwn.running)
        self.assertFalse(self.dwn.processing.is_scheduled())

        self.clock.advance(20)
        self.assertEqual(len(self.request_queue),
                         1)  # request 2 remains unqueued

        # downloader behavior after closing
        self.assertEqual(len(self.response_queue), 0)
        self.handler.call(req1, Response(''))
        self.assertEqual(len(self.response_queue), 0)

    def test_fail(self):
        '''Failed downloads end up on the response queue carrying their
        request and error; a later successful download is still delivered.
        '''
        self._update_dwn(CONCURRENT_REQUESTS=3,
                         CONCURRENT_REQUESTS_PER_DOMAIN=2)
        requests = [get_request(slot_id)[0] for slot_id in 'aab']
        # explicit loop instead of a side-effect-only `map` (lazy on
        # Python 3)
        for request in requests:
            self.request_queue.push(request)

        # enqueue requests
        self.clock.advance(0)
        # fail 1st request
        err = ValueError('my bad')
        self.handler.fail(requests[0], err)
        self.assertEqual(self.dwn.free_slots, 1)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[0])
        self.assertIs(fail.value, err)
        # fail 3rd request
        self.handler.fail(requests[2], err)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[2])
        self.assertIs(fail.value, err)
        # succeed 2nd request
        self.handler.call(requests[1], Response('nice!', request=requests[1]))
        resp = self.response_queue.pop()
        self.assertIs(resp.request, requests[1])
        self.assertEqual(resp.url, 'nice!')

    def test_clear_slots(self):
        '''After 30 sequential downloads the number of slots kept by the
        downloader must not exceed twice its total concurrency.
        '''
        # `range` gives the same ids as the Python 2-only `xrange`
        requests = [get_request(number)[0] for number in range(30)]
        for r in requests:
            self.request_queue.push(r)
            self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
            self.handler.call(r, Response(''))
        self.assertLessEqual(len(self.dwn.slots),
                             2 * self.dwn.total_concurrency)
Example #18
0
 def setUp(self):
     '''Prepare a fresh clock, a fresh ModifiedObject and a ScheduledCall
     of that object's `func` using the class-level default arguments.
     '''
     clock = Clock()
     target = ModifiedObject()
     self.clock = clock
     self.obj = target
     self.sc = ScheduledCall(target.func, clock=clock,
                             *self.default_args,
                             **self.default_kwargs)
Example #19
0
class DownloaderTest(unittest.TestCase):
    '''Exercise Downloader with a MockDownloaderHandler and a Clock that
    only moves when the test advances it.
    '''

    default_settings = {
        'CONCURRENT_REQUESTS': 2,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 0,
        'RANDOMIZE_DOWNLOAD_DELAY': False}

    def setUp(self):
        '''Set up the clock, both queues and a downloader that uses
        `default_settings`.
        '''
        self.clock = Clock()
        self.request_queue = MemoryQueue()
        self.response_queue = ResponseQueue()
        self._create_dwn(self.default_settings)

    def _create_dwn(self, settings):
        '''Instantiate a Downloader for the `settings` dict, reusing the
        existing queues and clock, and store it along with its handler.
        '''
        self.dwn = Downloader(Settings(settings), self.request_queue,
                              self.response_queue,
                              download_handler=MockDownloaderHandler(Settings()),
                              clock=self.clock)
        self.handler = self.dwn.download_handler

    def _update_dwn(self, **kwargs):
        '''Update downloader with the new settings.

        Keyword arguments replace the matching `default_settings` entries.
        The `processing` call of the old downloader is cancelled first.
        '''
        new_settings = self.default_settings.copy()
        new_settings.update(**kwargs)
        self.dwn.processing.cancel()
        self._create_dwn(new_settings)

    def test_concurrency(self):
        '''Verify how the settings translate into concurrency limits.'''
        # standard situation
        self._update_dwn()
        self.assertEqual(self.dwn.total_concurrency, 2)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertTrue(self.dwn.use_domain_specific)
        # delay set
        self._update_dwn(CONCURRENT_REQUESTS=10, CONCURRENT_REQUESTS_PER_DOMAIN=5,
                         DOWNLOAD_DELAY=5)
        self.assertEqual(self.dwn.total_concurrency, 1)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency is 0
        self._update_dwn(CONCURRENT_REQUESTS=10, CONCURRENT_REQUESTS_PER_DOMAIN=0)
        self.assertEqual(self.dwn.total_concurrency, 10)
        self.assertEqual(self.dwn.domain_concurrency, 10)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency is too big
        self._update_dwn(CONCURRENT_REQUESTS=5, CONCURRENT_REQUESTS_PER_DOMAIN=10)
        self.assertEqual(self.dwn.total_concurrency, 5)
        self.assertEqual(self.dwn.domain_concurrency, 5)
        self.assertFalse(self.dwn.use_domain_specific)
        # domain concurrency equal to the total one
        self._update_dwn(CONCURRENT_REQUESTS=5, CONCURRENT_REQUESTS_PER_DOMAIN=5)
        self.assertFalse(self.dwn.use_domain_specific)

    def test_get_slot(self):
        '''The slot key is the request's host; when `use_domain_specific`
        is switched off, all requests map to the single slot keyed ''.
        '''
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, 'www.github.com')
        key2, slot2 = self.dwn._get_slot(Request('http://www.github.com/hello/world#bla'))
        self.assertEqual(key2, 'www.github.com')
        self.assertIs(slot2, slot)
        key3, slot3 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertEqual(key3, 'sites.github.com')
        self.assertIsNot(slot3, slot)
        self.assertEqual(len(self.dwn.slots), 2)

        # don't use domain specific slots
        self.dwn.use_domain_specific = False
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, '')
        key2, slot2 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertIs(slot2, slot)

    def test_basic(self):
        '''Follow five requests from the request queue to the response
        queue, checking slot availability along the way.
        '''
        # create 5 requests with slot ids: a, b, a, a, c
        requests = [get_request(name)[0] for name in 'abaac']
        # a plain loop performs the pushes on every Python version; a bare
        # `map` call is lazy on Python 3 and would never run them
        for req in requests:
            self.request_queue.push(req)
        self.assertEqual(self.dwn.free_slots, 2)
        self.assertTrue(self.dwn.is_idle())

        # start downloading first two requests
        self.clock.advance(0)
        self.assertEqual(self.dwn.free_slots, 0)
        self.assertFalse(self.dwn.is_idle())
        # no more requests are scheduled, until download is finished
        self.clock.advance(20)
        self.assertEqual(len(self.request_queue), 3)
        # download the first request
        self.handler.call(requests[0], Response('hello'))
        self.assertEqual(self.dwn.free_slots, 1)  # slot is immediately available
        # result is also available
        result = self.response_queue.peek()
        self.assertIs(result.request, requests[0])
        self.assertEqual(result.url, 'hello')
        # enqueue third request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # download second request
        self.handler.call(requests[1], Response(''))
        # enqueue fourth request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # fourth request should not begin download, until 3rd request is done
        self.assertRaises(KeyError, self.handler.call, requests[3], Response(''))
        # finish
        self.handler.call(requests[2], Response(''))
        self.handler.call(requests[3], Response(''))
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.handler.call(requests[4], Response(''))
        # final checks
        self.clock.pump([1] * 10)
        self.assertEqual(len(self.response_queue), 5)
        self.assertTrue(self.dwn.is_idle())

    def test_close(self):
        '''A closed downloader neither dequeues further requests nor
        reports downloads that finish afterwards.
        '''
        req1 = get_request('a')[0]
        req2 = get_request('b')[0]
        self.request_queue.push(req1)
        self.clock.advance(20)
        self.request_queue.push(req2)
        # test basic attributes, before and after closing
        self.assertTrue(self.dwn.running)
        self.assertTrue(self.dwn.processing.is_scheduled())
        self.dwn.close()
        self.assertFalse(self.dwn.running)
        self.assertFalse(self.dwn.processing.is_scheduled())

        self.clock.advance(20)
        self.assertEqual(len(self.request_queue), 1)  # request 2 remains unqueued

        # downloader behavior after closing
        self.assertEqual(len(self.response_queue), 0)
        self.handler.call(req1, Response(''))
        self.assertEqual(len(self.response_queue), 0)

    def test_fail(self):
        '''Failures are put on the response queue with their request and
        error attached, and do not prevent a later success.
        '''
        self._update_dwn(CONCURRENT_REQUESTS=3, CONCURRENT_REQUESTS_PER_DOMAIN=2)
        requests = [get_request(name)[0] for name in 'aab']
        # plain loop rather than `map` called only for its side effects
        for req in requests:
            self.request_queue.push(req)

        # enqueue requests
        self.clock.advance(0)
        # fail 1st request
        err = ValueError('my bad')
        self.handler.fail(requests[0], err)
        self.assertEqual(self.dwn.free_slots, 1)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[0])
        self.assertIs(fail.value, err)
        # fail 3rd request
        self.handler.fail(requests[2], err)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[2])
        self.assertIs(fail.value, err)
        # succeed 2nd request
        self.handler.call(requests[1], Response('nice!', request=requests[1]))
        resp = self.response_queue.pop()
        self.assertIs(resp.request, requests[1])
        self.assertEqual(resp.url, 'nice!')

    def test_clear_slots(self):
        '''Thirty downloads performed one after another must leave at most
        2 * total_concurrency slots in the downloader.
        '''
        # `range` yields the same ids as `xrange` and exists on Python 3 too
        requests = [get_request(index)[0] for index in range(30)]
        for r in requests:
            self.request_queue.push(r)
            self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
            self.handler.call(r, Response(''))
        self.assertLessEqual(len(self.dwn.slots), 2 * self.dwn.total_concurrency)