def test_adjustable_semaphore(self):
    '''Check acquire, release and ``set_max`` on ``AdjustableSemaphore``.

    The test takes every slot, verifies that further acquires time out,
    grows and shrinks the maximum, and finally checks that
    over-releasing and a negative maximum both raise ``ValueError``.
    '''
    # NOTE(review): this is a generator-style test; presumably it is
    # wrapped by a coroutine test decorator outside this view -- confirm.
    short_wait = datetime.timedelta(seconds=0.1)
    semaphore = AdjustableSemaphore(value=2)

    # Both initial slots can be taken without timing out.
    yield semaphore.acquire(deadline=short_wait)
    yield semaphore.acquire(deadline=short_wait)

    # Every slot is held, so a third acquire is expected to time out.
    try:
        yield semaphore.acquire(deadline=short_wait)
    except toro.Timeout:
        pass
    else:
        self.fail()

    # Growing the maximum opens exactly one more slot.
    semaphore.set_max(3)
    self.assertEqual(3, semaphore.max)

    yield semaphore.acquire(deadline=short_wait)

    try:
        yield semaphore.acquire(deadline=short_wait)
    except toro.Timeout:
        pass
    else:
        self.fail()

    # Shrinking the maximum below the number of held slots: each of the
    # three held slots has to be released before an acquire can succeed.
    semaphore.set_max(1)
    self.assertEqual(1, semaphore.max)

    for _ in range(3):
        try:
            yield semaphore.acquire(deadline=short_wait)
        except toro.Timeout:
            pass
        else:
            self.fail()

        semaphore.release()

    # Nothing is held any more, so this acquire goes through.
    yield semaphore.acquire(deadline=short_wait)
    semaphore.release()

    # Releasing more than was acquired is an error.
    self.assertRaises(ValueError, semaphore.release)

    # So is a negative maximum.
    self.assertRaises(ValueError, semaphore.set_max, -1)
class Engine(HookableMixin): '''Manages and processes item. Args: url_table (:class:`.database.BaseURLTable`): A table of URLs to be processed. processor (:class:`.processor.BaseProcessor`): A processor that will do things to finish an item. statistics (:class:`.stats.Statistics`): Information needed to compute the exit status. concurrent (int): The number of items to process at once. The engine is described like the following: 1. Get an "todo" item from the table. If none, skip to step 4. 2. Ask the processor to process the item. 3. Go to step 1. 4. Get an "error" item from the table. If none, skip to step 7. 5. Ask the processor to process the item. 6. Go to step 4. 7. Stop. In the context of Wpull, URLs are the central part of items. ''' def __init__(self, url_table, processor, statistics, concurrent=1): super().__init__() self._url_table = url_table self._processor = processor self._statistics = statistics self._worker_semaphore = AdjustableSemaphore(concurrent) self._done_event = toro.Event() self._num_worker_busy = 0 self._stopping = False self._worker_error = None self.register_hook('engine_run') @property def concurrent(self): '''The concurrency value.''' return self._worker_semaphore.max def set_concurrent(self, value): self._worker_semaphore.set_max(value) @tornado.gen.coroutine def __call__(self): '''Run the engine. This function will clear any items marked as in-progress, start up the workers, and loop until a stop is requested. Returns: int: An integer describing the exit status. .. 
seealso:: :class:`.errors.ExitStatus` ''' try: self.call_hook('engine_run') except HookDisconnected: pass self._release_in_progress() self._run_workers() yield self._done_event.wait() if self._worker_error: raise self._worker_error from self._worker_error def _release_in_progress(self): '''Release any items in progress.''' _logger.debug('Release in-progress.') self._url_table.release() @tornado.gen.coroutine def _run_workers(self): '''Start the worker tasks.''' while not self._stopping: yield self._worker_semaphore.acquire() tornado.ioloop.IOLoop.current().add_future( self._process_input(), lambda future: future.result(), ) def _get_next_url_record(self): '''Return the next available URL from the URL table. This function will return items marked as "todo" and then items marked as "error". As a consequence, items experiencing errors will be done last. Returns: :class:`.item.URLRecord`. ''' _logger.debug('Get next URL todo.') try: url_record = self._url_table.get_and_update( Status.todo, new_status=Status.in_progress) except NotFound: url_record = None if not url_record: try: _logger.debug('Get next URL error.') url_record = self._url_table.get_and_update( Status.error, new_status=Status.in_progress) except NotFound: url_record = None _logger.debug(__('Return record {0}.', url_record)) return url_record @tornado.gen.coroutine def _process_input(self): '''Get an item and process it. If processing an item encounters an error, :func:`stop` is called. Contract: This function will release the ``_worker_semaphore``. 
''' try: while True: # Poll for an item if not self._stopping: url_record = self._get_next_url_record() else: url_record = None if not url_record: # TODO: need better check if we are done if self._num_worker_busy == 0: self.stop(force=True) self._worker_semaphore.release() return yield wpull.async.sleep(1.0) else: break self._num_worker_busy += 1 url_encoding = url_record.url_encoding or 'utf8' url_info = URLInfo.parse(url_record.url, encoding=url_encoding) url_item = URLItem(self._url_table, url_info, url_record) yield self._process_url_item(url_item) assert url_item.is_processed self._statistics.mark_done(url_info) except Exception as error: _logger.debug('Worker died from error.') self.stop(force=True) self._worker_error = error if self._statistics.is_quota_exceeded: _logger.debug('Stopping due to quota.') self.stop() self._num_worker_busy -= 1 self._worker_semaphore.release() @tornado.gen.coroutine def _process_url_item(self, url_item): '''Process an item. Args: url_item (:class:`.item.URLItem`): The item to process. This function calls :meth:`.processor.BaseProcessor.process`. ''' _logger.debug(__('Begin session for {0} {1}.', url_item.url_record, url_item.url_info)) yield self._processor.process(url_item) _logger.debug(__('End session for {0} {1}.', url_item.url_record, url_item.url_info)) def stop(self, force=False): '''Stop the engine. Args: force (bool): If ``True``, don't wait for Sessions to finish and stop the Engine immediately. If ``False``, the Engine will wait for all workers to finish. ''' _logger.debug(__('Stopping. force={0}', force)) self._stopping = True if force: self._done_event.set()
class Engine(object): '''Manages and processes item. Args: url_table (:class:`.database.BaseURLTable`): A table of URLs to be processed. processor (:class:`.processor.BaseProcessor`): A processor that will do things to finish an item. statistics (:class:`.stats.Statistics`): Information needed to compute the exit status. concurrent (int): The number of items to process at once. The engine is described like the following: 1. Get an "todo" item from the table. If none, skip to step 4. 2. Ask the processor to process the item. 3. Go to step 1. 4. Get an "error" item from the table. If none, skip to step 7. 5. Ask the processor to process the item. 6. Go to step 4. 7. Stop. In the context of Wpull, URLs are the central part of items. ''' ERROR_CODE_MAP = OrderedDict([ (ServerError, ExitStatus.server_error), (ProtocolError, ExitStatus.protocol_error), (SSLVerficationError, ExitStatus.ssl_verification_error), (DNSNotFound, ExitStatus.network_failure), (ConnectionRefused, ExitStatus.network_failure), (NetworkError, ExitStatus.network_failure), (OSError, ExitStatus.file_io_error), (IOError, ExitStatus.file_io_error), (ValueError, ExitStatus.parser_error), ]) '''Mapping of error types to exit status.''' def __init__(self, url_table, processor, statistics, concurrent=1): self._url_table = url_table self._processor = processor self._statistics = statistics self._worker_semaphore = AdjustableSemaphore(concurrent) self._done_event = toro.Event() self._num_worker_busy = 0 self._exit_code = 0 self._stopping = False self.stop_event = wpull.actor.Event() @property def concurrent(self): '''The concurrency value.''' return self._worker_semaphore.max def set_concurrent(self, value): self._worker_semaphore.set_max(value) @tornado.gen.coroutine def __call__(self): '''Run the engine. This function will clear any items marked as in-progress, start up the workers, and loop until a stop is requested. Returns: int: An integer describing the exit status. .. 
seealso:: :class:`.errors.ExitStatus` ''' self._statistics.start() self._release_in_progress() self._run_workers() yield self._done_event.wait() self._compute_exit_code_from_stats() if self._exit_code == ExitStatus.ssl_verification_error: self._print_ssl_error() self._statistics.stop() self._print_stats() self._processor.close() self._url_table.close() self.stop_event.fire() raise tornado.gen.Return(self._exit_code) def _release_in_progress(self): '''Release any items in progress.''' _logger.debug('Release in-progress.') self._url_table.release() @tornado.gen.coroutine def _run_workers(self): '''Start the worker tasks.''' while not self._stopping: yield self._worker_semaphore.acquire() tornado.ioloop.IOLoop.current().add_future( self._process_input(), lambda future: future.result(), ) def _get_next_url_record(self): '''Return the next available URL from the URL table. This function will return items marked as "todo" and then items marked as "error". As a consequence, items experiencing errors will be done last. Returns: :class:`.item.URLRecord`. ''' _logger.debug('Get next URL todo.') try: url_record = self._url_table.get_and_update( Status.todo, new_status=Status.in_progress) except NotFound: url_record = None if not url_record: try: _logger.debug('Get next URL error.') url_record = self._url_table.get_and_update( Status.error, new_status=Status.in_progress) except NotFound: url_record = None _logger.debug('Return record {0}.'.format(url_record)) return url_record @tornado.gen.coroutine def _process_input(self): '''Get an item and process it. If processing an item encounters an error, :func:`stop` is called. Contract: This function will release the ``_worker_semaphore``. 
''' try: while True: # Poll for an item if not self._stopping: url_record = self._get_next_url_record() else: url_record = None if not url_record: # TODO: need better check if we are done if self._num_worker_busy == 0: self.stop(force=True) self._worker_semaphore.release() return yield wpull.async.sleep(1.0) else: break self._num_worker_busy += 1 url_encoding = url_record.url_encoding or 'utf8' url_info = URLInfo.parse(url_record.url, encoding=url_encoding) url_item = URLItem(self._url_table, url_info, url_record) yield self._process_url_item(url_item) assert url_item.is_processed self._statistics.mark_done(url_info) except Exception as error: _logger.exception('Fatal exception.') self._update_exit_code_from_error(error) self.stop(force=True) if self._statistics.is_quota_exceeded: _logger.debug('Stopping due to quota.') self.stop() self._num_worker_busy -= 1 self._worker_semaphore.release() @tornado.gen.coroutine def _process_url_item(self, url_item): '''Process an item. Args: url_item (:class:`.item.URLItem`): The item to process. This function calls :meth:`.processor.BaseProcessor.process`. ''' _logger.debug('Begin session for {0} {1}.'.format( url_item.url_record, url_item.url_info)) yield self._processor.process(url_item) _logger.debug('End session for {0} {1}.'.format( url_item.url_record, url_item.url_info)) def stop(self, force=False): '''Stop the engine. Args: force (bool): If ``True``, don't wait for Sessions to finish and stop the Engine immediately. If ``False``, the Engine will wait for all workers to finish. ''' _logger.debug('Stopping. force={0}'.format(force)) self._stopping = True if force: self._done_event.set() def _update_exit_code_from_error(self, error): '''Set the exit code based on the error type. Args: error (:class:`Exception`): An exception instance. 
''' for error_type, exit_code in self.ERROR_CODE_MAP.items(): if isinstance(error, error_type): self._update_exit_code(exit_code) break else: self._update_exit_code(ExitStatus.generic_error) def _update_exit_code(self, code): '''Set the exit code if it is serious than before. Args: code (int): The exit code. ''' if code: if self._exit_code: self._exit_code = min(self._exit_code, code) else: self._exit_code = code def _compute_exit_code_from_stats(self): '''Set the current exit code based on the Statistics.''' for error_type in self._statistics.errors: exit_code = self.ERROR_CODE_MAP.get(error_type) if exit_code: self._update_exit_code(exit_code) def _print_stats(self): '''Log the final statistics to the user.''' stats = self._statistics time_length = datetime.timedelta( seconds=int(stats.stop_time - stats.start_time) ) file_size = wpull.string.format_size(stats.size) if stats.bandwidth_meter.num_samples: speed_size_str = wpull.string.format_size( stats.bandwidth_meter.speed() ) else: speed_size_str = _('-- B') _logger.info(_('FINISHED.')) _logger.info( _( 'Duration: {preformatted_timedelta}. ' 'Speed: {preformatted_speed_size}/s.' ).format( preformatted_timedelta=time_length, preformatted_speed_size=speed_size_str, ) ) _logger.info( gettext.ngettext( 'Downloaded: {num_files} file, {preformatted_file_size}.', 'Downloaded: {num_files} files, {preformatted_file_size}.', stats.files ).format( num_files=stats.files, preformatted_file_size=file_size ) ) if stats.is_quota_exceeded: _logger.info(_('Download quota exceeded.')) _logger.info(_('Exiting with status {0}.').format(self._exit_code)) def _print_ssl_error(self): '''Print an invalid SSL certificate warning.''' _logger.info(_('A SSL certificate could not be verified.')) _logger.info(_('To ignore and proceed insecurely, ' 'use ‘--no-check-certificate’.'))
def test_adjustable_semaphore(self):
    '''Check acquire, release and ``set_max`` on ``AdjustableSemaphore``.

    The test takes every slot, verifies that further acquires time out,
    grows and shrinks the maximum, and finally checks that
    over-releasing and a negative maximum both raise ``ValueError``.
    '''
    # NOTE(review): this is a generator-style test; presumably it is
    # wrapped by a coroutine test decorator outside this view -- confirm.
    short_wait = datetime.timedelta(seconds=0.1)
    semaphore = AdjustableSemaphore(value=2)

    # Both initial slots can be taken without timing out.
    yield semaphore.acquire(deadline=short_wait)
    yield semaphore.acquire(deadline=short_wait)

    # Every slot is held, so a third acquire is expected to time out.
    try:
        yield semaphore.acquire(deadline=short_wait)
    except toro.Timeout:
        pass
    else:
        self.fail()

    # Growing the maximum opens exactly one more slot.
    semaphore.set_max(3)
    self.assertEqual(3, semaphore.max)

    yield semaphore.acquire(deadline=short_wait)

    try:
        yield semaphore.acquire(deadline=short_wait)
    except toro.Timeout:
        pass
    else:
        self.fail()

    # Shrinking the maximum below the number of held slots: each of the
    # three held slots has to be released before an acquire can succeed.
    semaphore.set_max(1)
    self.assertEqual(1, semaphore.max)

    for _ in range(3):
        try:
            yield semaphore.acquire(deadline=short_wait)
        except toro.Timeout:
            pass
        else:
            self.fail()

        semaphore.release()

    # Nothing is held any more, so this acquire goes through.
    yield semaphore.acquire(deadline=short_wait)
    semaphore.release()

    # Releasing more than was acquired is an error.
    self.assertRaises(ValueError, semaphore.release)

    # So is a negative maximum.
    self.assertRaises(ValueError, semaphore.set_max, -1)