def setUp(self):
    '''Build the fixtures shared by every test.

    Creates a fresh `Clock`, a fresh `ModifiedObject` and a `ScheduledCall`
    wrapping `obj.func`, bound to that clock and to the class-level default
    positional and keyword arguments.
    '''
    fake_clock = Clock()
    target = ModifiedObject()
    self.clock = fake_clock
    self.obj = target
    self.sc = ScheduledCall(target.func, clock=fake_clock,
                            *self.default_args, **self.default_kwargs)
def __init__(self, download_handler, concurrency, delay, randomize_delay, clock=None):
    '''Store the download settings and create the empty bookkeeping state.

    `download_handler` must provide `download_request`; `concurrency`,
    `delay` and `randomize_delay` are stored unchanged. `clock` is only
    meant to be overridden in unit tests: when it is falsy, `reactor` is
    used instead.
    '''
    # clock is used in unittests
    self.clock = clock or reactor

    self.download_handler = download_handler
    self.concurrency = concurrency
    self.delay = delay
    self.randomize_delay = randomize_delay
    self.last_download_time = 0

    # requests that were enqueued and whose deferred has not fired yet
    self.in_progress = set()
    # requests being downloaded (subset of `in_progress`)
    self.transferring = set()
    # queue of (request, deferred)
    self.queue = MemoryQueue()

    # used by `_process()` to resume once the download delay has elapsed
    self.delayed_processing = ScheduledCall(self._process, clock=self.clock)
def __init__(self, settings, project, command_invoked='', clock=None):
    '''Constructor of Engine should be very lightweight, so that things
    can be easily unittested. For any more complicated initialization
    use `setup()`.

    `clock` is only meant to be overridden in unit tests: when it is falsy,
    `reactor` is used instead.
    '''
    # clock is used in unittests
    self.clock = clock or reactor

    self.settings = settings
    self.project = project
    self.spiders = SpiderManager(settings)
    # name of the command invoking the engine. E.g. `crawl`, `shell`, etc.
    self.command_invoked = command_invoked

    # lifecycle flags and counters
    self.stop_if_idle = True
    self.initialized = False  # True, when `setup()` has been called
    self.running = False
    self.paused = False
    self.pending_requests = 0
    self.spider = None

    self.processing = ScheduledCall(self._process_queue, clock=self.clock)
def test_init(self):
    '''A `ScheduledCall` created without a `clock` argument can be
    scheduled and cancelled.
    '''
    # test initializing ScheduledCall without overriding its clock
    call = ScheduledCall(self.obj.func, *self.default_args,
                         **self.default_kwargs)
    call.schedule()
    call.cancel()
class ScheduledCallTest(unittest.TestCase):
    '''Exercise `ScheduledCall` against a manually advanced `Clock`.'''

    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        '''Create a fresh clock, target object and call for every test.'''
        fake_clock = Clock()
        target = ModifiedObject()
        self.clock = fake_clock
        self.obj = target
        self.sc = ScheduledCall(target.func, clock=fake_clock,
                                *self.default_args, **self.default_kwargs)

    def _check(self, args, kwargs):
        '''Compare `self.obj.args` / `self.obj.kwargs` with the expectation.

        Passing `None` asserts that the corresponding attribute is `None`.
        '''
        if args is not None:
            self.assertTupleEqual(self.obj.args, args)
        else:
            self.assertIsNone(self.obj.args)

        if kwargs is not None:
            self.assertEqual(self.obj.kwargs, kwargs)
        else:
            self.assertIsNone(self.obj.kwargs)

    def test_init(self):
        # test initializing ScheduledCall without overriding its clock
        call = ScheduledCall(self.obj.func, *self.default_args,
                             **self.default_kwargs)
        call.schedule()
        call.cancel()

    def test_get_time_and_is_scheduled(self):
        call, clock = self.sc, self.clock

        # nothing scheduled yet
        clock.advance(10)
        self.assertFalse(call.is_scheduled())
        self.assertEqual(call.get_time(), 0)

        # scheduled at clock time 10 with delay 5
        call.schedule(5)
        self.assertTrue(call.is_scheduled())
        self.assertEqual(call.get_time(), 15)

        # after the delay has elapsed the call is no longer scheduled
        clock.advance(5)
        self.assertFalse(call.is_scheduled())
        self.assertEqual(call.get_time(), 0)

    def test_no_delay(self):
        self.sc.schedule()
        # scheduling alone must not invoke the function
        self._check(None, None)
        self.clock.advance(0)
        self._check(self.default_args, self.default_kwargs)

    def test_default(self):
        call, clock = self.sc, self.clock

        self.assertTrue(call.schedule(5))
        self._check(None, None)

        clock.advance(1)
        # a second schedule while one is pending is refused
        self.assertFalse(call.schedule(1))

        clock.advance(2)
        self._check(None, None)

        clock.advance(3)
        self._check(self.default_args, self.default_kwargs)

    def test_cancel(self):
        call, clock = self.sc, self.clock

        call.schedule(5)
        clock.advance(3)
        call.cancel()
        clock.advance(3)
        self._check(None, None)

        # after a cancel the call can be scheduled again
        self.assertTrue(call.schedule(1))
        clock.advance(1)
        self._check(self.default_args, self.default_kwargs)

    def test_overwrite(self):
        new_args = ('crawlmi',)
        new_kwargs = {'a': 50, 'd': 'e'}
        self.sc.schedule(5, *new_args, **new_kwargs)
        self.clock.advance(5)
        self._check(new_args, new_kwargs)

    def test_partial_overwrite(self):
        new_args = ('crawlmi',)
        self.sc.schedule(5, *new_args)
        self.clock.advance(5)
        self._check(new_args, {})

    def test_nested_schedule(self):
        def reschedule(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = reschedule
        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)

    def test_nested_schedule_delay(self):
        first_args, first_kwargs = ('a',), {'a': 'b'}
        second_args, second_kwargs = ('b',), {'b': 'c'}

        def reschedule(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule(4, *second_args, **second_kwargs)

        self.sc.func = reschedule
        self.sc.schedule(3, *first_args, **first_kwargs)

        self.clock.advance(3)
        # the call re-scheduled itself from within the scheduled function
        self.assertIsNotNone(self.sc._call)
        self._check(first_args, first_kwargs)

        self.clock.advance(3)
        self._check(first_args, first_kwargs)

        self.clock.advance(1)
        self._check(second_args, second_kwargs)
class ScheduledCallTest(unittest.TestCase):
    '''Exercise `ScheduledCall` against a manually advanced `Clock`.'''

    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        '''Create a fresh clock, target object and call for every test.'''
        fake_clock = Clock()
        target = ModifiedObject()
        self.clock = fake_clock
        self.obj = target
        self.sc = ScheduledCall(target.func, clock=fake_clock,
                                *self.default_args, **self.default_kwargs)

    def _check(self, args, kwargs):
        '''Compare `self.obj.args` / `self.obj.kwargs` with the expectation.

        Passing `None` asserts that the corresponding attribute is `None`.
        '''
        if args is not None:
            self.assertTupleEqual(self.obj.args, args)
        else:
            self.assertIsNone(self.obj.args)

        if kwargs is not None:
            self.assertEqual(self.obj.kwargs, kwargs)
        else:
            self.assertIsNone(self.obj.kwargs)

    def test_init(self):
        # test initializing ScheduledCall without overriding its clock
        call = ScheduledCall(self.obj.func, *self.default_args,
                             **self.default_kwargs)
        call.schedule()
        call.cancel()

    def test_get_time_and_is_scheduled(self):
        call, clock = self.sc, self.clock

        # nothing scheduled yet
        clock.advance(10)
        self.assertFalse(call.is_scheduled())
        self.assertEqual(call.get_time(), 0)

        # scheduled at clock time 10 with delay 5
        call.schedule(5)
        self.assertTrue(call.is_scheduled())
        self.assertEqual(call.get_time(), 15)

        # after the delay has elapsed the call is no longer scheduled
        clock.advance(5)
        self.assertFalse(call.is_scheduled())
        self.assertEqual(call.get_time(), 0)

    def test_no_delay(self):
        self.sc.schedule()
        # scheduling alone must not invoke the function
        self._check(None, None)
        self.clock.advance(0)
        self._check(self.default_args, self.default_kwargs)

    def test_default(self):
        call, clock = self.sc, self.clock

        self.assertTrue(call.schedule(5))
        self._check(None, None)

        clock.advance(1)
        # a second schedule while one is pending is refused
        self.assertFalse(call.schedule(1))

        clock.advance(2)
        self._check(None, None)

        clock.advance(3)
        self._check(self.default_args, self.default_kwargs)

    def test_cancel(self):
        call, clock = self.sc, self.clock

        call.schedule(5)
        clock.advance(3)
        call.cancel()
        clock.advance(3)
        self._check(None, None)

        # after a cancel the call can be scheduled again
        self.assertTrue(call.schedule(1))
        clock.advance(1)
        self._check(self.default_args, self.default_kwargs)

    def test_overwrite(self):
        new_args = ('crawlmi',)
        new_kwargs = {'a': 50, 'd': 'e'}
        self.sc.schedule(5, *new_args, **new_kwargs)
        self.clock.advance(5)
        self._check(new_args, new_kwargs)

    def test_partial_overwrite(self):
        new_args = ('crawlmi',)
        self.sc.schedule(5, *new_args)
        self.clock.advance(5)
        self._check(new_args, {})

    def test_nested_schedule(self):
        def reschedule(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = reschedule
        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)

    def test_nested_schedule_delay(self):
        first_args, first_kwargs = ('a',), {'a': 'b'}
        second_args, second_kwargs = ('b',), {'b': 'c'}

        def reschedule(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule(4, *second_args, **second_kwargs)

        self.sc.func = reschedule
        self.sc.schedule(3, *first_args, **first_kwargs)

        self.clock.advance(3)
        # the call re-scheduled itself from within the scheduled function
        self.assertIsNotNone(self.sc._call)
        self._check(first_args, first_kwargs)

        self.clock.advance(3)
        self._check(first_args, first_kwargs)

        self.clock.advance(1)
        self._check(second_args, second_kwargs)
class Slot(object):
    '''Slot represents a queue of requests for one particular domain.

    It respects both DOWNLOAD_DELAY and CONCURRENT_REQUESTS_PER_DOMAIN.
    '''

    def __init__(self, download_handler, concurrency, delay, randomize_delay, clock=None):
        '''Store the download settings and create the empty bookkeeping state.

        `clock` is only meant to be overridden in unit tests: when it is
        falsy, `reactor` is used instead.
        '''
        # clock is used in unittests
        self.clock = clock or reactor

        self.download_handler = download_handler
        self.concurrency = concurrency
        self.delay = delay
        self.randomize_delay = randomize_delay
        self.last_download_time = 0

        # requests that were enqueued and whose deferred has not fired yet
        self.in_progress = set()
        # requests being downloaded (subset of `in_progress`)
        self.transferring = set()
        # queue of (request, deferred)
        self.queue = MemoryQueue()

        # used by `_process()` to resume once the download delay has elapsed
        self.delayed_processing = ScheduledCall(self._process, clock=self.clock)

    def enqueue(self, request, dfd):
        '''Main entry point. Put the new request to the queue and if
        possible, start downloading it.

        `dfd` is the deferred that receives the download result; the request
        is dropped from `in_progress` when `dfd` fires.
        '''
        def _forget_request(result):
            self.in_progress.remove(request)
            return result

        self.in_progress.add(request)
        dfd.addBoth(_forget_request)
        self.queue.push((request, dfd))
        self._process()

    @property
    def free_slots(self):
        '''Number of additional requests that may be transferred right now.'''
        return self.concurrency - len(self.transferring)

    def is_idle(self):
        '''Return True when no request is in progress.'''
        return not self.in_progress

    def _process(self):
        '''Process the requests in the queue, while respecting the delay
        and concurrency.
        '''
        # a delayed run is already pending; it will pick the queue up later
        if self.delayed_processing.is_scheduled():
            return
        # the delay since the last download has not elapsed yet
        if self._schedule_delay():
            return

        while self.queue and self.free_slots > 0:
            self.last_download_time = self.clock.seconds()
            request, downloaded_dfd = self.queue.pop()
            self._download(request).chainDeferred(downloaded_dfd)
            if self._schedule_delay():
                break

    def _schedule_delay(self):
        '''Schedule a delayed `_process()` if the download delay is not over.

        Returns True when the remaining delay is positive (and a delayed run
        was requested), False otherwise.
        '''
        if not self.delay:
            return False

        next_allowed = self.last_download_time + self.get_download_delay()
        penalty = next_allowed - self.clock.seconds()
        if penalty > 0:
            # following schedule should always be successful, because
            # `_schedule_delay()` is only called from within `_process()`
            self.delayed_processing.schedule(penalty)
            return True
        return False

    def _download(self, request):
        '''Start downloading `request` and return the deferred of the result.

        The request is kept in `transferring` until the deferred fires.
        '''
        def _rewrap(failure):
            # it is VERY important to wrap the failure into a new object!
            # For errors like ConnectionLost, the same Failure object is
            # returned everytime and we cannot use 'failure.request' field.
            return Failure(failure.value)

        def _release(result):
            # after the response is downloaded, remove the request from
            # `transferring` and process the requests it was blocking
            self.transferring.remove(request)
            self._process()
            return result

        dfd = defer.succeed(request)
        # download the response
        dfd.addCallback(self.download_handler.download_request)
        dfd.addErrback(_rewrap)
        # put the request into the set of `transferring` to block other
        # requests
        self.transferring.add(request)
        dfd.addBoth(_release)
        return dfd

    def get_download_delay(self):
        '''Return the delay to apply between two downloads.

        With `randomize_delay` set, the value is drawn uniformly from
        [0.5 * delay, 1.5 * delay]; otherwise it is `delay` itself.
        '''
        if not self.randomize_delay:
            return self.delay
        return random.uniform(0.5 * self.delay, 1.5 * self.delay)
class Engine(object):
    '''Drives a single spider: passes requests through the pipeline, feeds
    the request/response queues and dispatches results to spider callbacks.

    WARNING: don't stop() and start() engine. Use pause() and unpause(),
    instead.
    '''

    # how many seconds to wait between the checks of response_queue
    QUEUE_CHECK_FREQUENCY = 0.1
    # how often to check is still paused
    # (passed as the delay to `processing.schedule()`)
    PAUSED_CHECK_FREQUENCY = 5
    # how often to check if being idle
    # (passed as the delay to `processing.schedule()`)
    IDLE_CHECK_FREQUENCY = 5

    def __init__(self, settings, project, command_invoked='', clock=None):
        '''Constructor of Engine should be very lightweight, so that things
        can be easily unittested. For any more complicated initialization
        use `setup()`.

        `clock` is only meant to be overridden in unit tests: when it is
        falsy, `reactor` is used instead.
        '''
        self.settings = settings
        self.project = project
        self.spiders = SpiderManager(settings)

        self.stop_if_idle = True
        self.initialized = False  # True, when `setup()` has been called
        # name of the command invoking the engine. E.g. `crawl`, `shell`, etc.
        self.command_invoked = command_invoked

        self.spider = None
        self.pending_requests = 0
        self.running = False
        self.paused = False

        # clock is used in unittests
        self.clock = clock or reactor
        self.processing = ScheduledCall(self._process_queue, clock=self.clock)

    def set_spider(self, spider):
        '''Remember `spider` and install its settings into `self.settings`.'''
        self.spider = spider
        self.settings.spider_settings = spider.spider_settings()

    def setup(self):
        '''Create logging, signals, stats, queues, downloader, extensions
        and pipeline, then hand the engine over to the spider.

        `set_spider()` must have been called before.
        '''
        assert self.spider is not None, 'Spider is not set in Engine.'

        # IMPORTANT: order of the following initializations is very important
        # so please, think twice about any changes to it

        # initialize logging
        if self.settings.get_bool('LOG_ENABLED'):
            log.start(
                self.settings['LOG_FILE'],
                self.settings['LOG_LEVEL'],
                self.settings['LOG_STDOUT'],
                self.settings['LOG_ENCODING'])

        # initialize signals
        self.signals = SignalManager(self)

        # initialize stats
        stats_cls = load_object(self.settings.get('STATS_CLASS'))
        self.stats = stats_cls(self)

        # initialize downloader
        self.request_queue = PriorityQueue(lambda _: MemoryQueue())
        self.response_queue = ResponseQueue(
            self.settings.get_int('RESPONSE_ACTIVE_SIZE_LIMIT'))
        self.downloader = Downloader(self.settings, self.request_queue,
                                     self.response_queue, clock=self.clock)

        # initialize extensions
        self.extensions = ExtensionManager(self)
        # initialize downloader pipeline
        self.pipeline = PipelineManager(self)

        self.initialized = True

        # now that everything is ready, set the spider's engine
        self.spider.set_engine(self)

    def crawl_start_requests(self):
        '''Pass every start request of the spider to `download()`.

        Errors raised while producing or submitting the start requests are
        logged and not propagated.
        '''
        # process start requests from spider
        try:
            requests = self.spider.start_requests()
            for req in arg_to_iter(requests):
                self.download(req)
        except Exception:
            # `Exception` rather than a bare `except`, so that
            # KeyboardInterrupt and SystemExit are not swallowed here
            log.err(Failure(), 'Error when processing start requests.')

    def start(self):
        '''Mark the engine as running, send `engine_started` and schedule
        the first check of the response queue.
        '''
        assert self.initialized, 'Engine is not initialized. Call `setup()` to initialize it.'

        self.start_time = time()
        self.running = True
        self.signals.send(signal=signals.engine_started)
        self.processing.schedule(self.QUEUE_CHECK_FREQUENCY)

    def stop(self, reason=''):
        '''Stop the engine and return the deferred of the shutdown.

        `running` is cleared immediately; the rest of the shutdown runs as a
        callback of the returned deferred.
        '''
        assert self.running, 'Engine is not running.'
        self.running = False

        def _stop(_):
            self.processing.cancel()
            self.downloader.close()
            self.request_queue.close()
            self.response_queue.close()
            log.msg(format='Engine stopped (%(reason)s)', reason=reason)
            self.signals.send(signal=signals.engine_stopped, reason=reason)
            self.stats.dump_stats()

        dfd = defer_succeed(reason, clock=self.clock)
        dfd.addBoth(_stop)
        return dfd

    def pause(self):
        '''Make `_process_queue()` skip processing until `unpause()`.'''
        self.paused = True

    def unpause(self):
        '''Resume processing after `pause()`.'''
        self.paused = False

    def download(self, request):
        '''"Download" the given request. First pass it through the downloader
        pipeline.
            - if the request is received, push it to `request_queue`
            - if the response is received , push it to `response_queue`

        Returns the deferred of the pipeline processing.
        '''
        def _success(request_or_response):
            if isinstance(request_or_response, Request):
                self.signals.send(signal=signals.request_received,
                                  request=request_or_response)
                if self.running:
                    self.request_queue.push(request_or_response.priority,
                                            request_or_response)
            elif isinstance(request_or_response, Response):
                request_or_response.request = request
                if self.running:
                    self.response_queue.push(request_or_response)

        def _failure(failure):
            # the pipeline failed: route the failure to the spider and
            # release the pending request
            failure.request = request
            dfd = defer_fail(failure, clock=self.clock)
            dfd.addBoth(self._handle_pipeline_result)
            dfd.addBoth(self._finalize_download)
            return dfd

        self.pending_requests += 1
        d = defer_succeed(request, clock=self.clock)
        d.addCallback(self.pipeline.process_request)
        d.addCallbacks(_success, _failure)
        return d

    def is_idle(self):
        '''Return True when there is no pending request and no queued
        response.
        '''
        return self.pending_requests == 0 and len(self.response_queue) == 0

    def _process_queue(self):
        '''One step of the main loop: handle at most one queued response
        and re-schedule itself, or stop the engine when it became idle.
        '''
        if not self.running:
            return
        elif self.paused:
            self.processing.schedule(self.PAUSED_CHECK_FREQUENCY)
        elif self.response_queue:
            response = self.response_queue.pop()
            if isinstance(response, Response):
                self.signals.send(signal=signals.response_downloaded,
                                  response=response)
            dfd = defer_result(response, clock=self.clock)
            dfd.addBoth(self.pipeline.process_response)
            dfd.addBoth(self._handle_pipeline_result)
            dfd.addBoth(self._finalize_download)
            # continue with the next response as soon as this one is done
            dfd.addBoth(lambda _: self.processing.schedule(0))
        elif self.is_idle():
            # send `spider_idle` signal
            res = self.signals.send(signal=signals.spider_idle,
                                    dont_log=DontStopEngine)
            dont_stop = any(
                isinstance(x, Failure) and isinstance(x.value, DontStopEngine)
                for _, x in res)

            # more requests have been scheduled
            if not self.is_idle():
                self.processing.schedule(0)
            # slow down a little, but still run
            elif dont_stop or not self.stop_if_idle:
                self.processing.schedule(self.IDLE_CHECK_FREQUENCY)
            else:
                self.stop('finished')
        else:
            self.processing.schedule(self.QUEUE_CHECK_FREQUENCY)

    def _finalize_download(self, _):
        '''Deferred callback: one pending request has been fully handled.'''
        self.pending_requests -= 1

    def _handle_pipeline_result(self, result):
        '''Dispatch the result of the downloader pipeline.

        `None` is ignored, a `Request` is downloaded again, and a `Response`
        or `Failure` is announced through signals and passed to the callback
        or errback of its request. In the last case the deferred of the
        spider processing is returned.
        '''
        if result is None:
            pass
        elif isinstance(result, Request):
            self.download(result)
        else:
            assert isinstance(result, (Response, Failure))
            request = result.request
            if isinstance(result, Response):
                flags = ' %s' % result.flags if result.flags else ''
                log.msg(format='Crawled %(url)s [%(status)s]%(flags)s',
                        level=log.DEBUG, url=result.url, status=result.status,
                        flags=flags)
                self.signals.send(signal=signals.response_received,
                                  response=result)
            else:
                self.signals.send(signal=signals.failure_received,
                                  failure=result)
            dfd = defer_result(result, clock=self.clock)
            dfd.addCallbacks(request.callback or self.spider.parse,
                             request.errback)
            dfd.addCallbacks(
                self._handle_spider_output,
                self._handle_spider_error,
                callbackKeywords={'request': request},
                errbackKeywords={'request': request})
            return dfd

    def _handle_spider_output(self, result, request):
        '''Download every request returned by a spider callback.

        `request` is the request whose callback produced `result`; it is
        supplied by `_handle_pipeline_result()` and not used here.
        '''
        for new_request in arg_to_iter(result):
            assert isinstance(new_request, Request), \
                'spider must return None, request or iterable of requests'
            self.download(new_request)

    def _handle_spider_error(self, failure, request):
        '''Handle an error raised while the spider processed `request`.

        `StopEngine` stops the engine; any other error is announced through
        the `spider_error` signal and logged unless it is marked `quiet`.
        '''
        error = failure.value
        if isinstance(error, StopEngine):
            # several callbacks may raise `StopEngine` while the engine is
            # already shutting down; only the first one may call `stop()`
            if self.running:
                self.stop(error.reason)
            return
        # set `request` in a case the error was raised inside the spider
        failure.request = request
        self.signals.send(signal=signals.spider_error, failure=failure)
        if not getattr(failure.value, 'quiet', False):
            log.err(failure, 'Error when downloading %s' % request)

    def __str__(self):
        return '<%s at 0x%0x>' % (type(self).__name__, id(self))
    __repr__ = __str__
class Engine(object):
    '''Drives a single spider: passes requests through the pipeline, feeds
    the request/response queues and dispatches results to spider callbacks.

    WARNING: don't stop() and start() engine. Use pause() and unpause(),
    instead.
    '''

    # how many seconds to wait between the checks of response_queue
    QUEUE_CHECK_FREQUENCY = 0.1
    # how often to check is still paused
    # (passed as the delay to `processing.schedule()`)
    PAUSED_CHECK_FREQUENCY = 5
    # how often to check if being idle
    # (passed as the delay to `processing.schedule()`)
    IDLE_CHECK_FREQUENCY = 5

    def __init__(self, settings, project, command_invoked='', clock=None):
        '''Constructor of Engine should be very lightweight, so that things
        can be easily unittested. For any more complicated initialization
        use `setup()`.

        `clock` is only meant to be overridden in unit tests: when it is
        falsy, `reactor` is used instead.
        '''
        self.settings = settings
        self.project = project
        self.spiders = SpiderManager(settings)

        self.stop_if_idle = True
        self.initialized = False  # True, when `setup()` has been called
        # name of the command invoking the engine. E.g. `crawl`, `shell`, etc.
        self.command_invoked = command_invoked

        self.spider = None
        self.pending_requests = 0
        self.running = False
        self.paused = False

        # clock is used in unittests
        self.clock = clock or reactor
        self.processing = ScheduledCall(self._process_queue, clock=self.clock)

    def set_spider(self, spider):
        '''Remember `spider` and install its settings into `self.settings`.'''
        self.spider = spider
        self.settings.spider_settings = spider.spider_settings()

    def setup(self):
        '''Create logging, signals, stats, queues, downloader, extensions
        and pipeline, then hand the engine over to the spider.

        `set_spider()` must have been called before.
        '''
        assert self.spider is not None, 'Spider is not set in Engine.'

        # IMPORTANT: order of the following initializations is very important
        # so please, think twice about any changes to it

        # initialize logging
        if self.settings.get_bool('LOG_ENABLED'):
            log.start(
                self.settings['LOG_FILE'],
                self.settings['LOG_LEVEL'],
                self.settings['LOG_STDOUT'],
                self.settings['LOG_ENCODING'])

        # initialize signals
        self.signals = SignalManager(self)

        # initialize stats
        stats_cls = load_object(self.settings.get('STATS_CLASS'))
        self.stats = stats_cls(self)

        # initialize downloader
        self.request_queue = PriorityQueue(lambda _: MemoryQueue())
        self.response_queue = ResponseQueue(
            self.settings.get_int('RESPONSE_ACTIVE_SIZE_LIMIT'))
        self.downloader = Downloader(self.settings, self.request_queue,
                                     self.response_queue, clock=self.clock)

        # initialize extensions
        self.extensions = ExtensionManager(self)
        # initialize downloader pipeline
        self.pipeline = PipelineManager(self)

        self.initialized = True

        # now that everything is ready, set the spider's engine
        self.spider.set_engine(self)

    def crawl_start_requests(self):
        '''Pass every start request of the spider to `download()`.

        Errors raised while producing or submitting the start requests are
        logged and not propagated.
        '''
        # process start requests from spider
        try:
            requests = self.spider.start_requests()
            for req in arg_to_iter(requests):
                self.download(req)
        except Exception:
            # `Exception` rather than a bare `except`, so that
            # KeyboardInterrupt and SystemExit are not swallowed here
            log.err(Failure(), 'Error when processing start requests.')

    def start(self):
        '''Mark the engine as running, send `engine_started` and schedule
        the first check of the response queue.
        '''
        assert self.initialized, 'Engine is not initialized. Call `setup()` to initialize it.'

        self.start_time = time()
        self.running = True
        self.signals.send(signal=signals.engine_started)
        self.processing.schedule(self.QUEUE_CHECK_FREQUENCY)

    def stop(self, reason=''):
        '''Stop the engine and return the deferred of the shutdown.

        `running` is cleared immediately; the rest of the shutdown runs as a
        callback of the returned deferred.
        '''
        assert self.running, 'Engine is not running.'
        self.running = False

        def _stop(_):
            self.processing.cancel()
            self.downloader.close()
            self.request_queue.close()
            self.response_queue.close()
            log.msg(format='Engine stopped (%(reason)s)', reason=reason)
            self.signals.send(signal=signals.engine_stopped, reason=reason)
            self.stats.dump_stats()

        dfd = defer_succeed(reason, clock=self.clock)
        dfd.addBoth(_stop)
        return dfd

    def pause(self):
        '''Make `_process_queue()` skip processing until `unpause()`.'''
        self.paused = True

    def unpause(self):
        '''Resume processing after `pause()`.'''
        self.paused = False

    def download(self, request):
        '''"Download" the given request. First pass it through the downloader
        pipeline.
            - if the request is received, push it to `request_queue`
            - if the response is received , push it to `response_queue`

        Returns the deferred of the pipeline processing.
        '''
        def _success(request_or_response):
            if isinstance(request_or_response, Request):
                self.signals.send(signal=signals.request_received,
                                  request=request_or_response)
                if self.running:
                    self.request_queue.push(request_or_response.priority,
                                            request_or_response)
            elif isinstance(request_or_response, Response):
                request_or_response.request = request
                if self.running:
                    self.response_queue.push(request_or_response)

        def _failure(failure):
            # the pipeline failed: route the failure to the spider and
            # release the pending request
            failure.request = request
            dfd = defer_fail(failure, clock=self.clock)
            dfd.addBoth(self._handle_pipeline_result)
            dfd.addBoth(self._finalize_download)
            return dfd

        self.pending_requests += 1
        d = defer_succeed(request, clock=self.clock)
        d.addCallback(self.pipeline.process_request)
        d.addCallbacks(_success, _failure)
        return d

    def is_idle(self):
        '''Return True when there is no pending request and no queued
        response.
        '''
        return self.pending_requests == 0 and len(self.response_queue) == 0

    def _process_queue(self):
        '''One step of the main loop: handle at most one queued response
        and re-schedule itself, or stop the engine when it became idle.
        '''
        if not self.running:
            return
        elif self.paused:
            self.processing.schedule(self.PAUSED_CHECK_FREQUENCY)
        elif self.response_queue:
            response = self.response_queue.pop()
            if isinstance(response, Response):
                self.signals.send(signal=signals.response_downloaded,
                                  response=response)
            dfd = defer_result(response, clock=self.clock)
            dfd.addBoth(self.pipeline.process_response)
            dfd.addBoth(self._handle_pipeline_result)
            dfd.addBoth(self._finalize_download)
            # continue with the next response as soon as this one is done
            dfd.addBoth(lambda _: self.processing.schedule(0))
        elif self.is_idle():
            # send `spider_idle` signal
            res = self.signals.send(signal=signals.spider_idle,
                                    dont_log=DontStopEngine)
            dont_stop = any(
                isinstance(x, Failure) and isinstance(x.value, DontStopEngine)
                for _, x in res)

            # more requests have been scheduled
            if not self.is_idle():
                self.processing.schedule(0)
            # slow down a little, but still run
            elif dont_stop or not self.stop_if_idle:
                self.processing.schedule(self.IDLE_CHECK_FREQUENCY)
            else:
                self.stop('finished')
        else:
            self.processing.schedule(self.QUEUE_CHECK_FREQUENCY)

    def _finalize_download(self, _):
        '''Deferred callback: one pending request has been fully handled.'''
        self.pending_requests -= 1

    def _handle_pipeline_result(self, result):
        '''Dispatch the result of the downloader pipeline.

        `None` is ignored, a `Request` is downloaded again, and a `Response`
        or `Failure` is announced through signals and passed to the callback
        or errback of its request. In the last case the deferred of the
        spider processing is returned.
        '''
        if result is None:
            pass
        elif isinstance(result, Request):
            self.download(result)
        else:
            assert isinstance(result, (Response, Failure))
            request = result.request
            if isinstance(result, Response):
                flags = ' %s' % result.flags if result.flags else ''
                log.msg(format='Crawled %(url)s [%(status)s]%(flags)s',
                        level=log.DEBUG, url=result.url, status=result.status,
                        flags=flags)
                self.signals.send(signal=signals.response_received,
                                  response=result)
            else:
                self.signals.send(signal=signals.failure_received,
                                  failure=result)
            dfd = defer_result(result, clock=self.clock)
            dfd.addCallbacks(request.callback or self.spider.parse,
                             request.errback)
            dfd.addCallbacks(
                self._handle_spider_output,
                self._handle_spider_error,
                callbackKeywords={'request': request},
                errbackKeywords={'request': request})
            return dfd

    def _handle_spider_output(self, result, request):
        '''Download every request returned by a spider callback.

        `request` is the request whose callback produced `result`; it is
        supplied by `_handle_pipeline_result()` and not used here.
        '''
        for new_request in arg_to_iter(result):
            assert isinstance(new_request, Request), \
                'spider must return None, request or iterable of requests'
            self.download(new_request)

    def _handle_spider_error(self, failure, request):
        '''Handle an error raised while the spider processed `request`.

        `StopEngine` stops the engine; any other error is announced through
        the `spider_error` signal and logged unless it is marked `quiet`.
        '''
        error = failure.value
        if isinstance(error, StopEngine):
            # several callbacks may raise `StopEngine` while the engine is
            # already shutting down; only the first one may call `stop()`
            if self.running:
                self.stop(error.reason)
            return
        # set `request` in a case the error was raised inside the spider
        failure.request = request
        self.signals.send(signal=signals.spider_error, failure=failure)
        if not getattr(failure.value, 'quiet', False):
            log.err(failure, 'Error when downloading %s' % request)

    def __str__(self):
        return '<%s at 0x%0x>' % (type(self).__name__, id(self))
    __repr__ = __str__