def __init__(self, starting_batch_size, max_workers, retry_exceptions=RETRY_EXCEPTIONS):
    """Set up the executor state.

    :param starting_batch_size: initial number of work items per batch.
    :param max_workers: upper bound on concurrent worker threads.
    :param retry_exceptions: exception types that trigger the retry path.
    """
    self.batch_size = starting_batch_size
    self.max_workers = max_workers
    # A bounded executor keeps the work queue from growing without limit,
    # and the fail-safe wrapper monitors in-progress futures so errors
    # surface quickly instead of being silently queued behind more work.
    self.executor = FailSafeExecutor(BoundedExecutor(1, self.max_workers))
    self.retry_exceptions = retry_exceptions
    self.progress_logger = ProgressLogger()
class BatchWorkExecutor:
    """Runs batches of work items on a bounded thread pool.

    When a batch fails with one of the retryable exceptions, the shared
    batch size is halved (so later batches shrink) and the failed batch is
    re-run one item at a time.
    """

    def __init__(self, starting_batch_size, max_workers, retry_exceptions=RETRY_EXCEPTIONS):
        self.batch_size = starting_batch_size
        self.max_workers = max_workers
        # A bounded executor prevents unlimited queue growth and lets us
        # monitor in-progress futures and fail fast in case of errors.
        self.executor = FailSafeExecutor(BoundedExecutor(1, self.max_workers))
        self.retry_exceptions = retry_exceptions
        self.progress_logger = ProgressLogger()

    def execute(self, work_iterable, work_handler, total_items=None):
        """Split work_iterable into batches and submit each to the pool.

        The batch-size callback is re-read for every batch, so a size
        reduction made by a failing worker affects subsequent batches.
        """
        self.progress_logger.start(total_items=total_items)
        for batch in dynamic_batch_iterator(work_iterable, lambda: self.batch_size):
            self.executor.submit(self._fail_safe_execute, work_handler, batch)

    # NOTE(review): batch_size is read and written from multiple worker
    # threads without a lock — check race conditions.
    def _fail_safe_execute(self, work_handler, batch):
        try:
            work_handler(batch)
        except self.retry_exceptions:
            observed_size = self.batch_size
            # Halve the shared batch size so subsequent batches are smaller.
            if observed_size == len(batch) and observed_size > 1:
                self.batch_size = int(observed_size / 2)
            # Re-run the failed batch item by item.
            for item in batch:
                work_handler([item])
        self.progress_logger.track(len(batch))

    def shutdown(self):
        """Stop accepting work, wait for completion, and log the summary."""
        self.executor.shutdown()
        self.progress_logger.finish()
def test_progress_logger():
    """ProgressLogger logs a start line, one line per log_item_step items,
    and a finish line with the total and elapsed time."""
    logger_mock = LoggerMock()
    progress_logger = ProgressLogger(logger=logger_mock, log_item_step=1000)
    progress_logger.start()
    # Plain loop instead of a list comprehension: track() is called only
    # for its side effect and the comprehension's result was discarded.
    for _ in range(100):
        progress_logger.track(100)
    progress_logger.finish()
    # 1 start line + 10 progress lines (10000 items / 1000 step) + 1 finish line.
    assert len(logger_mock.logs) == 12
    assert logger_mock.logs[0] == 'Started work.'
    assert logger_mock.logs[1] == '1000 items processed.'
    assert logger_mock.logs[11].startswith('Finished work. Total items processed: 10000. Took ')
def test_progress_logger_with_total_items():
    """With total_items given, every track() call logs a percentage line;
    over-counting past total_items produces the '!!!'-marked line."""
    logger_mock = LoggerMock()
    progress_logger = ProgressLogger(logger=logger_mock, log_percentage_step=5)
    progress_logger.start(total_items=1234)
    # Plain loop instead of a list comprehension: track() is called only
    # for its side effect and the comprehension's result was discarded.
    for _ in range(100):
        progress_logger.track(99)
    progress_logger.finish()
    # 1 start line + 100 progress lines + 1 finish line.
    assert len(logger_mock.logs) == 102
    assert logger_mock.logs[0] == 'Started work. Items to process: 1234.'
    assert logger_mock.logs[1] == '99 items processed. Progress is 8%.'
    # 9900 tracked items against a total of 1234 exceeds 100%.
    assert logger_mock.logs[100] == '9900 items processed. Progress is 802%!!!'
    assert logger_mock.logs[101].startswith('Finished work. Total items processed: 9900. Took ')
class BatchWorkExecutor:
    """Runs batches of work items on a bounded thread pool with adaptive
    batch sizing.

    After a failing batch the shared batch size is halved; after successful
    batches it is doubled again (never past the starting size) once a
    cooldown period has elapsed since the last change. A failed batch is
    re-run one item at a time, each item with bounded retries.
    """

    def __init__(self, starting_batch_size, max_workers, retry_exceptions=RETRY_EXCEPTIONS, max_retries=5):
        self.batch_size = starting_batch_size
        # The starting size also serves as the ceiling when growing back.
        self.max_batch_size = starting_batch_size
        self.latest_batch_size_change_time = None
        self.max_workers = max_workers
        # A bounded executor prevents unlimited queue growth and lets us
        # monitor in-progress futures and fail fast in case of errors.
        self.executor = FailSafeExecutor(BoundedExecutor(1, self.max_workers))
        self.retry_exceptions = retry_exceptions
        self.max_retries = max_retries
        self.progress_logger = ProgressLogger()
        self.logger = logging.getLogger('BatchWorkExecutor')

    def execute(self, work_iterable, work_handler, total_items=None):
        """Split work_iterable into batches and submit each to the pool.

        The batch-size callback is re-read for every batch, so adaptive
        size changes made by workers affect subsequent batches.
        """
        self.progress_logger.start(total_items=total_items)
        for batch in dynamic_batch_iterator(work_iterable, lambda: self.batch_size):
            self.executor.submit(self._fail_safe_execute, work_handler, batch)

    def _fail_safe_execute(self, work_handler, batch):
        try:
            work_handler(batch)
            self._try_increase_batch_size(len(batch))
        except self.retry_exceptions:
            self.logger.exception('An exception occurred while executing work_handler.')
            self._try_decrease_batch_size(len(batch))
            self.logger.info('The batch of size {} will be retried one item at a time.'.format(len(batch)))
            # Fall back to per-item execution with bounded retries.
            for item in batch:
                execute_with_retries(
                    work_handler, [item],
                    max_retries=self.max_retries,
                    retry_exceptions=self.retry_exceptions)
        self.progress_logger.track(len(batch))

    # Some acceptable race conditions are possible: batch_size and
    # latest_batch_size_change_time are updated from worker threads
    # without a lock.
    def _try_decrease_batch_size(self, current_batch_size):
        """Halve the shared batch size after a failure, if it still matches
        the failed batch's size and can shrink further."""
        observed_size = self.batch_size
        if observed_size == current_batch_size and observed_size > 1:
            reduced_size = int(current_batch_size / 2)
            self.logger.info('Reducing batch size to {}.'.format(reduced_size))
            self.batch_size = reduced_size
            self.latest_batch_size_change_time = time.time()

    def _try_increase_batch_size(self, current_batch_size):
        """Double the batch size after a success, but only up to the
        starting size and only after the cooldown period has elapsed."""
        if current_batch_size * 2 > self.max_batch_size:
            return
        now = time.time()
        last_change = self.latest_batch_size_change_time
        elapsed = now - last_change if last_change is not None else 0
        if elapsed > BATCH_CHANGE_COOLDOWN_PERIOD_SECONDS:
            increased_size = current_batch_size * 2
            self.logger.info('Increasing batch size to {}.'.format(increased_size))
            self.batch_size = increased_size
            self.latest_batch_size_change_time = now

    def shutdown(self):
        """Stop accepting work, wait for completion, and log the summary."""
        self.executor.shutdown()
        self.progress_logger.finish()