def process_batch(
    stage: BatchStage, items: Sequence[DataItem], error_manager: ErrorManager
) -> List[Optional[DataItem]]:
    """
    Execute the :meth:`.stage.BatchStage.process_batch` method of a batch stage for a batch of items
    """
    ret: List[Optional[DataItem]] = [None] * len(items)
    to_process = {}
    for i, item in enumerate(items):
        if error_manager.check_critical_errors(item):
            ret[i] = item
        else:
            _logger.debug(f"{stage} is going to process {item}")
            to_process[i] = item
    time1 = time.time()
    try:
        _logger.debug(f"{stage} is processing {len(to_process)} items")
        processed = stage.process_batch(list(to_process.values()))
        _logger.debug(f"{stage} has finished processing {len(to_process)} items")
    except Exception as e:
        _logger.debug(f"{stage} had failures in processing {len(to_process)} items")
        spent = (time.time() - time1) / (len(to_process) or 1.0)
        for i, item in to_process.items():
            item.set_timing(stage.name, spent)
            error_manager.handle(e, stage, item)
            ret[i] = item
        return ret
    spent = (time.time() - time1) / (len(to_process) or 1.0)
    for n, i in enumerate(to_process.keys()):
        item = processed[n]
        item.set_timing(stage.name, spent)
        ret[i] = item
    return ret

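# A minimal usage sketch for `process_batch`, kept in a helper so it does not run
# on import. Assumptions (not part of this module): the `BatchTextReverser` test
# helper reverses `payload["text"]`, and the stage name is already set, as the
# pipeline containers normally do when a stage is appended.
def _example_process_batch():
    items = [DataItem() for _ in range(3)]
    for item in items:
        item.payload["text"] = "hello"
    stage = BatchTextReverser(size=3)
    results = process_batch(stage, items, ErrorManager())
    # items flagged with critical errors would be returned untouched; the others
    # are processed and receive a per-item timing under the stage name
    assert all(result.payload["text"] == "olleh" for result in results)
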
def test_batch_errors(caplog):
    pipeline = (
        _pipeline()
        .set_error_manager(ErrorManager())
        .set_source(RandomTextSource(22))
        .append_stage("reverser", BatchTextReverser(size=5))
        .append_stage("error", BatchErrorStage(size=3))
        .build()
    )
    for item in pipeline.run():
        assert item.has_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error")
        error = next(item.soft_errors())
        assert error.get_exception() is None
        assert str(error) == "test pipeline error"
    assert pipeline.count == 22
    assert any(caplog.records)
    pipeline = (
        _pipeline()
        .set_error_manager(ErrorManager())
        .set_source(RandomTextSource(28))
        .append_stage("reverser", BatchTextReverser(size=8))
        .append_stage("error", BatchErrorStage(size=7))
        .append_stage("duplicator", BatchTextDuplicator(size=5))
        .build()
    )
    for item in pipeline.run():
        assert item.has_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("duplicator")
        assert any(k.startswith("text_") for k in item.payload.keys())
        assert item.get_timing("error")
        error = next(item.soft_errors())
        assert error.get_exception() is None
        assert str(error) == "test pipeline error"
    assert pipeline.count == 28
    assert any(caplog.records)
    pipeline = (
        _pipeline()
        .set_error_manager(ErrorManager())
        .set_source(RandomTextSource(10))
        .append_stage("reverser", BatchTextReverser(size=3))
        .append_stage("error1", BatchExceptionStage(size=7))
        .append_stage("error2", BatchErrorStage(size=1))
        .build()
    )
    for item in pipeline.run():
        assert item.has_critical_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error1") >= 0.0003
        assert not item.get_timing("error2")
        for error in item.critical_errors():
            assert isinstance(error.get_exception(), Exception)
            assert (
                str(error.get_exception()) == "test exception"
                and str(error) != "test pipeline error"
            )
    assert pipeline.count == 10
    assert any(caplog.records)
    with pytest.raises(Exception):
        pipeline = (
            _pipeline()
            .set_source(RandomTextSource(10))
            .append_stage("reverser", BatchTextReverser(size=4))
            .append_stage("error", BatchExceptionStage(size=3))
            .build()
        )
        for _ in pipeline.run():
            pass
    assert pipeline.count == 1

def test_batch_concurrent_stage_container1():
    manager = Manager()
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(200)]))
    previous = BatchStageContainer("test0", BatchTextGenerator(), ErrorManager())
    previous.set_previous(source)
    container = BatchConcurrentStageContainer(
        "test2",
        BatchTextReverser(timeout=1.0),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(previous)
    container.run()
    for _ in range(10):
        previous.process()
    items5 = list(_get_items(container))
    assert items5 and all(items5)
    assert container.count() == len(items5)
    for _ in range(11):
        previous.process()
    items6 = list(_get_items(container))
    assert items6 and all(items6)
    assert all(item.payload.get("text") for item in items5)
    assert all(item.payload.get("text") for item in items6)
    assert items5 != items6
    assert not container.is_stopped() and not container.is_terminated()
    container.empty_queues()
    container.terminate()
    container = BatchConcurrentStageContainer(
        "test2",
        BatchTextReverser(timeout=0.0),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(previous)
    container.run()
    queue = container.out_queue
    for item in items6:
        queue.put(item)
    result = list(_get_items(container))
    for i, item in enumerate(items6):
        assert item.payload == result[i].payload, "On item {}".format(i)
    container.terminate()

def test_batch_concurrent_stage_container2():
    manager = Manager()
    source = SourceContainer()
    items = [DataItem() for _ in range(10)]
    for item in items:
        item.payload["text"] = "something"
    source.set(ListSource(items))
    container = BatchConcurrentStageContainer(
        "test3",
        BatchTextGenerator(),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(source)
    container.run()
    for _ in range(10):
        source.pop_into_queue()
    time.sleep(2)
    assert list(_get_items(container))
    container.terminate()
    source.prepend_item(None)
    time.sleep(1)
    assert container.is_terminated()
    container.shutdown()

def batch_stage_executor(
    stage: BatchStage,
    in_queue: ItemsQueue,
    out_queue: ItemsQueue,
    error_manager: ErrorManager,
    terminated: Event,
    has_started_counter: ConcurrentCounter,
    counter: ConcurrentCounter,
):
    """
    Consume items in batches from an input queue, process them and put them in
    an output queue, indefinitely, until a termination event is set
    """
    if isinstance(counter, ProcessCounter):
        # call these only if the stage and the error manager are copies of the original,
        # ergo this executor is running in a child process
        error_manager.on_start()
        stage.on_start()
    has_started_counter += 1
    while True:
        if terminated.is_set() and in_queue.empty():
            return
        items = []
        try:
            for _ in range(stage.size):
                item = in_queue.get(block=True, timeout=stage.timeout)
                # give priority to the Stop event item
                if isinstance(item, Stop):
                    out_queue.put(item, block=True)
                elif item is not None:
                    items.append(item)
                in_queue.task_done()
        except queue.Empty:
            if not any(items):
                continue
        if any(items):
            try:
                items = process_batch(stage, items, error_manager)
            except Exception as e:
                raise e
            else:
                for item in items:
                    if item is not None:
                        out_queue.put(item, block=True)
                        if not isinstance(item, Stop):
                            counter += 1

def process(stage: Stage, item: DataItem, error_manager: ErrorManager) -> DataItem:
    """
    Execute the :meth:`.stage.Stage.process` method of a stage for an item
    """
    if error_manager.check_critical_errors(item):
        return item
    time1 = time.time()
    try:
        _logger.debug(f"{stage} is processing {item}")
        processed_item = stage.process(item)
        _logger.debug(f"{stage} has finished processing {processed_item}")
    except Exception as e:
        _logger.debug(f"{stage} has failed processing {item}")
        item.set_timing(stage.name, time.time() - time1)
        error_manager.handle(e, stage, item)
        return item
    # this can't go in a finally clause, otherwise it would also register the
    # time spent in `error_manager.handle`
    processed_item.set_timing(stage.name, time.time() - time1)
    return processed_item

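# A matching sketch for the single-item `process`, assuming the `TextReverser`
# test helper and that its stage name is set (the pipeline does this when the
# stage is appended). On success the stage's returned item carries the timing;
# on an exception the original item is returned with the error registered
# through the error manager instead of propagating.
def _example_process():
    item = DataItem()
    item.payload["text"] = "hello"
    result = process(TextReverser(), item, ErrorManager())
    # under these assumptions result.payload["text"] == "olleh" and the elapsed
    # seconds are recorded under the stage name
    return result
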
def stage_executor(
    stage: Stage,
    in_queue: ItemsQueue,
    out_queue: ItemsQueue,
    error_manager: ErrorManager,
    terminated: Event,
    has_started_counter: ConcurrentCounter,
    counter: ConcurrentCounter,
):
    """
    Consume items from an input queue, process them and put them in an output queue,
    indefinitely, until a termination event is set
    """
    if isinstance(counter, ProcessCounter):
        # call these only if the stage and the error manager are copies of the original,
        # ergo this executor is running in a child process
        error_manager.on_start()
        stage.on_start()
    has_started_counter += 1
    while True:
        if terminated.is_set() and in_queue.empty():
            return
        try:
            item = in_queue.get(block=True, timeout=CONCURRENCY_WAIT)
        except queue.Empty:
            continue
        if isinstance(item, Stop):
            out_queue.put(item, block=True)
            in_queue.task_done()
        elif item is not None:
            try:
                item = process(stage, item, error_manager)
            except Exception as e:
                raise e
            else:
                if item is not None:
                    out_queue.put(item, block=True)
                    if not isinstance(item, Stop):
                        counter += 1
            finally:
                in_queue.task_done()

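# A sketch of driving `stage_executor` manually on a thread, which is roughly
# what ConcurrentStageContainer does internally. Assumptions: a plain
# queue.Queue satisfies ItemsQueue, and ThreadCounter (the counter used by
# Pipeline._new_counter) is the thread-safe ConcurrentCounter. With a
# ThreadCounter the executor does not call stage.on_start() itself, so we do it
# here as the pipeline normally would.
def _example_stage_executor():
    import queue as std_queue
    from threading import Event, Thread

    stage = TextReverser()
    stage.on_start()
    in_queue, out_queue = std_queue.Queue(), std_queue.Queue()
    terminated = Event()
    has_started_counter, counter = ThreadCounter(), ThreadCounter()
    worker = Thread(
        target=stage_executor,
        args=(stage, in_queue, out_queue, ErrorManager(), terminated,
              has_started_counter, counter),
        daemon=True,
    )
    worker.start()
    item = DataItem()
    item.payload["text"] = "hello"
    in_queue.put(item)
    processed = out_queue.get()  # blocks until the executor has handled the item
    terminated.set()  # the loop exits once the event is set and the queue is empty
    worker.join()
    return processed
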
def test_manager(caplog):
    manager = ErrorManager()
    stage = TextReverser()
    item = DataItem()
    manager.handle(SoftError(), stage, item)
    assert any(caplog.records)
    assert item.has_errors()
    assert not item.has_critical_errors()
    item = DataItem()
    manager.handle(CriticalError(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert any(caplog.records)
    item = DataItem()
    manager.handle(ValueError(), stage, item)
    manager.handle(KeyError(), stage, item)
    manager.handle(KeyError(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert len(list(item.critical_errors())) == 3
    for record in caplog.records:
        assert "has generated an error" in record.message

def test_batch_stage_container2():
    source = SourceContainer()
    container = BatchStageContainer("test1", BatchTextReverser(), ErrorManager())
    items = [DataItem() for _ in range(10)]
    for item in items:
        item.payload["text"] = "something"
    source.set(ListSource(items))
    container.set_previous(source)
    processed = container.process()
    assert len(processed) == 10 and not any(
        isinstance(item, Stop) for item in processed
    )
    reprocessed = container.process()
    assert any(isinstance(item, Stop) for item in reprocessed)
    assert container.is_stopped() and not container.is_terminated()

def test_batch_stage_container1():
    manager = Manager()
    simple_item = DataItem()
    simple_item.payload["text"] = "hello world"
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(200)]))
    previous = BatchStageContainer("test0", BatchTextGenerator(), ErrorManager())
    previous.set_previous(source)
    container = BatchStageContainer("test1", BatchTextReverser(), ErrorManager())
    container.set_previous(previous)
    previous.process()
    items1 = container.process()
    assert len(items1) == container.count()
    items2 = list(_get_items(container))
    assert all(items1) and all(items2)
    assert all(item.payload.get("text") for item in items1)
    assert all(item.payload.get("text") for item in items2)
    assert items1 == items2
    previous.process()
    items3 = container.process()
    items4 = list(_get_items(container))
    assert all(items3) and all(items4)
    assert all(item.payload.get("text") for item in items3)
    assert all(item.payload.get("text") for item in items4)
    assert items1 != items3
    assert items3 == items4
    assert not container.is_stopped() and not container.is_terminated()
    container.init_queue(manager.Queue)
    queue = container.out_queue
    for item in items4:
        queue.put(item)
    result = list(_get_items(container))
    for i, item in enumerate(items4):
        assert item.payload == result[i].payload

def test_stage_container():
    manager = Manager()
    simple_item = DataItem()
    simple_item.payload["text"] = "hello world"
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(20)]))
    previous = StageContainer("test0", TextGenerator(), ErrorManager())
    previous.set_previous(source)
    container = StageContainer("test1", TextReverser(), ErrorManager())
    container.set_previous(previous)
    previous.process()
    assert container.count() == 0
    item1 = container.process()
    item2 = container.get_processed()
    assert item1 and item2
    assert item1 == item2
    assert container.count() == 1
    previous.process()
    item3 = container.process()
    item4 = container.get_processed()
    assert container.count() == 2
    assert item3 and item4
    assert item1 != item3
    assert item3 == item4
    assert not container.is_stopped() and not container.is_terminated()
    container.init_queue(manager.Queue)
    queue = container.out_queue
    queue.put(item4)
    assert item4.payload == container.get_processed().payload
    source = SourceContainer()
    source.set(ListSource([simple_item]))
    container.set_previous(source)
    assert container.process()
    assert isinstance(container.process(), Stop)
    assert container.is_stopped() and not container.is_terminated()
    container = ConcurrentStageContainer(
        "test2",
        TextReverser(),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(previous)
    container.run()
    previous.process()
    item5 = container.get_processed(block=True)
    assert item5
    previous.process()
    item6 = container.get_processed(block=True)
    assert item6
    assert item5 != item6
    assert not container.is_stopped() and not container.is_terminated()
    queue = container.out_queue
    queue.put(item6)
    assert item6.payload == container.get_processed().payload
    container.terminate()
    container.shutdown()
    source = SourceContainer()
    source.set(ListSource([simple_item]))
    container = ConcurrentStageContainer(
        "test2",
        TextReverser(),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(source)
    container.run()
    source.pop_into_queue()
    assert container.get_processed(block=True)
    source.pop_into_queue()
    assert isinstance(container.get_processed(block=True), Stop)
    container.terminate()
    source.prepend_item(None)
    time.sleep(1)
    assert container.is_terminated()
    container.shutdown()

def _pipeline(*args, **kwargs):
    return Pipeline(*args, **kwargs).set_error_manager(
        ErrorManager().raise_on_critical_error()
    )

def test_critical_errors(caplog):
    stage = TextReverser()
    manager = ErrorManager()
    item = DataItem()
    error = CriticalError()
    error.with_exception(Exception())
    managed_critical_error = manager.handle(error, stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert isinstance(
        next(item.critical_errors()).get_exception(),
        type(managed_critical_error.get_exception()),
    )
    assert any(caplog.records)
    manager = ErrorManager().raise_on_critical_error()
    item = DataItem()
    with pytest.raises(CriticalError):
        manager.handle(CriticalError(), stage, item)
    with pytest.raises(Exception):
        error = CriticalError().with_exception(Exception())
        manager.handle(error, stage, item)
    assert any(caplog.records)
    assert item.has_critical_errors()
    assert not item.has_errors()
    manager = ErrorManager().no_skip_on_critical_error()
    item = DataItem()
    assert manager.handle(CriticalError(), stage, item) is None
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert any(caplog.records)
    item = DataItem()
    manager.handle(ValueError(), stage, item)
    manager.handle(KeyError(), stage, item)
    manager.handle(KeyError(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert len(list(item.critical_errors())) == 3
    for record in caplog.records:
        assert "has generated an error" in record.message

class Pipeline:
    def __init__(
        self,
        max_init_workers: Optional[int] = None,
        max_queues_size: int = MAX_QUEUES_SIZE,
    ):
        """
        :param max_init_workers: Number of workers to use for concurrent initialization of stages,
            defaults to the number of CPUs
        :param max_queues_size: Maximum size of any queue instanced for the pipeline
            (stage input and output queues)
        """
        self._containers = LastOrderedDict()
        self._error_manager = ErrorManager()
        self._max_init_workers = max_init_workers
        self._init_executor = None
        self._wait_previous_executor = None
        self._pipeline_executor = None
        self._max_queues_size = max_queues_size
        self._out_queue = None
        self._enqueue_source = False
        self._sync_manager = None
        # an empty source, on which we can only occasionally send items
        self._source_container = SourceContainer()
        self._count = 0
        self._executors_ready = False

    def _new_mp_queue(self) -> ItemsQueue:
        """
        Construct a queue for multiprocessing communication
        """
        if self._sync_manager is None:
            self._sync_manager = Manager()
        return self._sync_manager.Queue(maxsize=self._max_queues_size)

    def _new_queue(self) -> ItemsQueue:
        """
        Construct a queue for communication
        """
        return Queue(maxsize=self._max_queues_size)

    def _new_mp_event(self) -> Event:
        """
        Construct a synchronization event for multiprocessing
        """
        if self._sync_manager is None:
            self._sync_manager = Manager()
        return self._sync_manager.Event()

    @staticmethod
    def _new_event() -> Event:
        """
        Construct a synchronization event
        """
        return Event()

    def _new_mp_counter(self) -> ProcessCounter:
        """
        Construct a safe counter for multiprocessing
        """
        if self._sync_manager is None:
            self._sync_manager = Manager()
        return ProcessCounter(self._sync_manager)

    @staticmethod
    def _new_counter() -> ThreadCounter:
        """
        Construct a safe counter for threads
        """
        return ThreadCounter()

    def _wait_executors(self, wait_seconds: float = CONCURRENCY_WAIT):
        """
        Wait for all containers to start

        :param wait_seconds: Recurrently wait these seconds for all stage initializers to finish
        """
        if self._executors_ready:
            return
        if self._init_executor is not None:
            self._init_executor.shutdown(wait=True)
            self._init_executor = None
        while not all(self._containers.values()):
            time.sleep(wait_seconds)
        self._wait_previous_executor.shutdown(wait=True)
        for name, container in self._containers.items():
            if isinstance(
                container, (ConcurrentStageContainer, BatchConcurrentStageContainer)
            ):
                container.run()
        # finalize initialization of the error manager shared by this and other stage threads
        self._error_manager.on_start()
        self._executors_ready = True
        _logger.debug("Pipeline ready to run")

    def shutdown(self):
        if self._out_queue is not None:
            self._out_queue.join()
        # if self._init_executor is not None:
        #     self._init_executor.shutdown()
        # FIXME stage shutdown may raise an exception, the executor gets stuck
        # for name, stage in self._containers.items():
        #     if isinstance(stage, (ConcurrentStageContainer, BatchConcurrentStageContainer)):
        #         stage.shutdown()
        if self._sync_manager is not None:
            self._sync_manager.shutdown()

    def __del__(self):
        self.shutdown()

    def build(self) -> Pipeline:
        """
        Pipeline builder method
        """
        if not any(self._containers):
            raise ValueError("Must append at least one stage")
        _logger.debug(f"Building the pipeline on stages: {self._log_stages()}")
        self._wait_executors()
        return self

    def run(self) -> Generator[DataItem, None, None]:
        """
        Run the pipeline given a source and a concatenation of stages.
        Get the sequence of items through iteration

        :return: Iterator over processed items
        :raises ValueError: When a source has not been set for the pipeline
        """
        if not self._source_container.is_set():
            raise ValueError("Set the data source for this pipeline")
        _logger.debug(f"Running the pipeline on stages: {self._log_stages()}")
        counter = 0
        last_stage_name = self._last_stage_name()
        terminator_thread = None
        source_thread = None
        # in case the first stage is concurrent
        if self._enqueue_source:
            source_thread = Thread(target=self._source_container.pop_into_queue)
            source_thread.start()
        while True:
            for name, container in self._containers.items():
                try:
                    # concurrent stages run by themselves in threads/processes
                    if not isinstance(
                        container,
                        (ConcurrentStageContainer, BatchConcurrentStageContainer),
                    ):
                        container.process()
                    # but we must periodically check for errors
                    else:
                        container.check_errors()
                except Exception as e:
                    self.stop()
                    # TODO in case of errors we lose pending items!
                    self._terminate_all(force=True)
                    self.shutdown()
                    self._count += 1
                    raise e
                # retrieve finally processed items from the last stage
                if name == last_stage_name:
                    for _ in range(
                        container.size
                        if isinstance(container, BatchStageContainer)
                        else 1
                    ):
                        item = container.get_processed()
                        if item is not None:
                            if not isinstance(item, Stop):
                                yield item
                                counter += 1
                                self._count += 1
                            # if a stop is finally signaled, start termination of all containers
                            elif not self._all_terminated() and terminator_thread is None:
                                terminator_thread = Thread(target=self._terminate_all)
                                terminator_thread.start()
                        # an item is None if the final output queue is empty
                        else:
                            break
            # exit the loop only when all items have been returned
            if self._all_empty() and counter >= self._source_container.count():
                if source_thread is not None:
                    source_thread.join()
                if terminator_thread is not None:
                    terminator_thread.join()
                self.shutdown()
                return

    @property
    def count(self) -> int:
        """
        Get the number of items processed by all executed runs, including items that have failed

        :return: Count of processed items
        """
        return self._count

    def _terminate_all(
        self, force: bool = False, wait_seconds: float = CONCURRENCY_WAIT
    ):
        """
        Terminate all running containers

        :param force: If True, do not wait for a container to process all items produced by the source
        :param wait_seconds: Time to wait before pinging a container again for its termination
        """
        _logger.debug("Terminating the pipeline")
        # scroll the pipeline in its order and terminate stages after their queues are empty
        for container in self._containers.values():
            if not force:
                # ensure the stage has processed all source items
                while container.count() < self._source_container.count():
                    time.sleep(wait_seconds)
            container.terminate()
            if isinstance(container, ConcurrentStageContainer):
                if force:
                    # empty the queues, losing pending items
                    container.empty_queues()
                while not container.is_terminated():
                    time.sleep(wait_seconds)
                container.queues_join()
                while not container.queues_empty():
                    time.sleep(wait_seconds)
        _logger.debug("Termination done")

    def _all_terminated(self) -> bool:
        """
        Check if all containers have been alerted for termination and have exited
        """
        return all(
            container.is_terminated() for container in self._containers.values()
        )

    def _all_empty(self) -> bool:
        """
        Check if all containers are terminated and there are no items left in the queues
        """
        return self._all_terminated() and all(
            container.queues_empty()
            for container in self._containers.values()
            if isinstance(
                container, (ConcurrentStageContainer, BatchConcurrentStageContainer)
            )
        )

    def process(self, item: DataItem) -> DataItem:
        """
        Process a single item synchronously (no concurrency) through the pipeline
        """
        _logger.debug(f"Processing {item} on stages: {self._log_stages()}")
        last_stage_name = self._containers.last_key()
        self._source_container.prepend_item(item)
        for name, container in self._containers.items():
            container.process()
            if name == last_stage_name:
                return container.get_processed(block=True)

    def process_async(
        self, item: DataItem, callback: Optional[Callable[[DataItem], Any]] = None
    ):
        """
        Process a single item asynchronously through the pipeline; stages may run concurrently.
        The call returns immediately, processed items are retrieved with :meth:`.Pipeline.get_item`

        :param callback: A function to call after a successful processing of the item
        """
        _logger.debug(
            f"Processing asynchronously {item} on stages: {self._log_stages()}"
        )
        if callback is not None:
            item.set_callback(callback)
        self._source_container.prepend_item(item)
        self._start_pipeline_executor()

    def stop(self):
        """
        Tell the source to stop generating items and consequently stop the pipeline
        """
        self._source_container.stop()

    def get_item(self, block: bool = True) -> DataItem:
        """
        Get a single item from the asynchronous execution of the pipeline
        on single items from :meth:`.Pipeline.process_async`

        :param block: If True, wait indefinitely for the next processed item
        :raises ValueError: When no output queue is set, i.e. the pipeline is not running asynchronously
        :raises queue.Empty: When we do not block and the queue is empty
        """
        if self._out_queue is not None:
            item = self._out_queue.get(block)
            self._out_queue.task_done()
            return item
        else:
            raise ValueError(
                "No pipeline is running asynchronously, no item can be retrieved from the output queue"
            )

    def set_source(self, source: Source) -> Pipeline:
        """
        Set the source of the pipeline: a subclass of :class:`.stage.Source`
        """
        self._source_container.set(source)
        return self

    def set_error_manager(self, error_manager: ErrorManager) -> Pipeline:
        """
        Set the error manager for handling errors from each stage item processing
        """
        self._error_manager = error_manager
        for container in self._containers.values():
            container.set_error_manager(self._error_manager)
        return self

    def _last_stage_name(self) -> str:
        if self._containers:
            return self._containers.last_key()

    def _last_container(self) -> BaseContainer:
        if self._containers:
            return self._containers[self._last_stage_name()]
        else:
            return self._source_container

    def _wait_for_previous(
        self,
        container: ConnectedStageMixin,
        last_stage_name: str,
        wait_seconds: float = CONCURRENCY_WAIT,
    ):
        """
        Given a container we want to append to the pipeline, wait for the last one
        (added to the pipeline) to be created

        :param container: A container to add to the pipeline
        :param last_stage_name: Name of the last stage currently in the pipeline
        :param wait_seconds: Time to recurrently wait for the construction of the container
            relative to the last stage in the pipeline
        """

        def _waiter():
            if last_stage_name is not None:
                while self._containers[last_stage_name] is None:
                    time.sleep(wait_seconds)
                container.set_previous(self._containers[last_stage_name])
            else:
                container.set_previous(self._source_container)

        executor = self._get_wait_previous_executor()
        executor.submit(_waiter)

    def _build_container(
        self, name: str, stage: StageType, concurrency: int, parallel: bool
    ) -> BaseContainer:
        """
        Get a new container instance according to the pipeline configuration

        :param name: Stage name
        :param stage: A stage instance
        :param concurrency: Number of concurrent stage executions,
            if 0 then just create the non-concurrent containers
        :param parallel: If True use multiprocessing, otherwise threads
        """
        if concurrency <= 0:
            constructor = (
                BatchStageContainer if isinstance(stage, BatchStage) else StageContainer
            )
            # if not concurrent we must explicitly finalize initialization of this single stage object
            stage.on_start()
            return constructor(name, stage, self._error_manager)
        else:
            constructor = (
                BatchConcurrentStageContainer
                if isinstance(stage, BatchStage)
                else ConcurrentStageContainer
            )
            if parallel:
                return constructor(
                    name,
                    stage,
                    self._error_manager,
                    self._new_mp_queue,
                    self._new_mp_counter,
                    self._new_mp_event,
                    concurrency,
                    parallel,
                )
            else:
                # if the stage is executed on multiple threads we must finalize initialization once,
                # while with multiprocessing each process executor calls it for its own copy of the stage
                stage.on_start()
                return constructor(
                    name,
                    stage,
                    self._error_manager,
                    self._new_queue,
                    self._new_counter,
                    self._new_event,
                    concurrency,
                    parallel,
                )

    def get_stage(self, name: str) -> StageType:
        """
        Get a stage instance by its name
        """
        return self._containers.get(name).stage

    def append_stage(
        self,
        name: str,
        stage: StageType,
        concurrency: int = 0,
        parallel: bool = False,
    ) -> Pipeline:
        """
        Append a stage to the pipeline just after the last one appended,
        or after the source if it is the first stage

        :param name: Name that identifies the stage in the pipeline; it is also set
            in the stage and it must be unique in the pipeline
        :param stage: Instance of a stage
        :param concurrency: Number of concurrent stage executions,
            if 0 then threads/processes won't be involved for this stage
        :param parallel: If True use multiprocessing, otherwise threads
        """
        self._executors_ready = False
        # FIXME here we force a BatchStage to run on a thread, but we would leave it on the main thread
        if concurrency < 1 and isinstance(stage, BatchStage):
            parallel = False
            concurrency = 1
        self._check_stage_name(name)
        container = self._build_container(name, stage, concurrency, parallel)
        if concurrency > 0:
            # if it is concurrent and it is the first stage, make the source work on an output queue
            if not self._containers:
                self._enqueue_source = True
        # wait until the previous stage is initialized
        self._wait_for_previous(container, self._last_stage_name())
        self._containers[name] = container
        return self

    def append_stage_concurrently(
        self,
        name: str,
        stage_class: Callable,
        args: Sequence = None,
        kwargs: Mapping = None,
        concurrency: int = 0,
        parallel: bool = False,
    ) -> Pipeline:
        """
        Append a stage class to the pipeline just after the last one appended,
        or after the source if it is the first stage.
        The stage construction will be executed concurrently with respect to
        the general pipeline construction

        :param name: Name that identifies the stage in the pipeline; it is also set
            in the stage and it must be unique in the pipeline
        :param stage_class: Class of a stage
        :param args: List of arguments for the stage constructor
        :param kwargs: Dictionary of keyword arguments for the stage constructor
        :param concurrency: Number of concurrent stage executions,
            if 0 then threads/processes won't be involved for this stage
        :param parallel: If True use multiprocessing, otherwise threads
        """
        self._executors_ready = False
        # FIXME here we force a BatchStage to run on a thread, but we would leave it on the main thread
        if concurrency < 1 and issubclass(stage_class, BatchStage):
            parallel = False
            concurrency = 1
        if kwargs is None:
            kwargs = {}
        if args is None:
            args = []
        self._check_stage_name(name)
        # if it is concurrent and it is the first stage, make the source work on an output queue
        if concurrency > 0 and not self._containers:
            self._enqueue_source = True
        last_stage_name = self._last_stage_name()
        # set it immediately so that the order of the calls of this method is preserved in `_containers`
        self._containers[name] = None
        future = self._get_init_executor(parallel).submit(stage_class, *args, **kwargs)

        def append_stage(stage_future: Future):
            stage = stage_future.result()
            container = self._build_container(name, stage, concurrency, parallel)
            self._wait_for_previous(container, last_stage_name)
            self._containers[name] = container

        future.add_done_callback(append_stage)
        return self

    def _get_init_executor(self, parallel: bool = False) -> Executor:
        """
        Get a pool executor for concurrent stage initialization

        :param parallel: True if the executor uses multiprocessing, otherwise threads
        """
        if self._init_executor is None:
            executor = ThreadPoolExecutor if not parallel else ProcessPoolExecutor
            self._init_executor = executor(max_workers=self._max_init_workers)
        return self._init_executor

    def _get_wait_previous_executor(self) -> Executor:
        """
        Get a pool executor for the function that will recurrently wait for a container to be ready
        """
        if self._wait_previous_executor is None:
            self._wait_previous_executor = ThreadPoolExecutor()
        return self._wait_previous_executor

    def _start_pipeline_executor(self) -> Thread:
        """
        Get a thread where to run a pipeline that accepts asynchronous processing of single items
        """
        if self._pipeline_executor is None:
            self._init_out_queue()

            def pipeline_runner():
                for item in self.run():
                    item.callback()
                    self._out_queue.put(item)

            self._pipeline_executor = Thread(target=pipeline_runner, daemon=True)
            self._pipeline_executor.start()
        return self._pipeline_executor

    def _check_stage_name(self, name: str):
        """
        Check that a stage name is not already defined in the pipeline

        :raises ValueError: When the stage name is already defined in the pipeline
        """
        if name in self._containers:
            raise ValueError(f"The stage name {name} is already used in this pipeline")

    def _init_out_queue(self):
        """
        Create the internal output queue for asynchronous processing of single items
        """
        self._out_queue = self._new_queue()

    def _log_stages(self):
        return ", ".join(self._containers.keys())

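# An end-to-end sketch of the builder API above, reusing the RandomTextSource and
# TextReverser helpers that the tests assume. concurrency=0 keeps the stage in
# the main thread; concurrency > 0 with parallel=True would run it in child
# processes instead.
def _example_pipeline_run():
    pipeline = (
        Pipeline()
        .set_source(RandomTextSource(10))
        .append_stage("reverser", TextReverser())
        .build()
    )
    for item in pipeline.run():
        # each processed item records how long the stage took on it
        assert item.get_timing("reverser") is not None
    assert pipeline.count == 10
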
def test_errors(caplog):
    pipeline = (
        _pipeline()
        .set_error_manager(ErrorManager())
        .set_source(RandomTextSource(10))
        .append_stage("reverser", TextReverser())
        .append_stage("error", ErrorStage())
        .build()
    )
    for item in pipeline.run():
        assert item.has_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error")
        error = next(item.soft_errors())
        assert error.get_exception() is None
        assert str(error) == "test pipeline error"
    assert all(
        "stage error has generated an error" in record.msg.lower()
        for record in caplog.records
        if record.levelno == logging.ERROR
    )
    assert pipeline.count == 10
    pipeline = (
        _pipeline()
        .set_error_manager(ErrorManager())
        .set_source(RandomTextSource(10))
        .append_stage("reverser", TextReverser())
        .append_stage("error", ErrorStage())
        .append_stage("duplicator", TextDuplicator())
        .build()
    )
    caplog.clear()
    for item in pipeline.run():
        assert item.has_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("duplicator")
        assert any(k.startswith("text_") for k in item.payload.keys())
        assert item.get_timing("error")
        error = next(item.soft_errors())
        assert error.get_exception() is None
        assert str(error) == "test pipeline error"
    assert all(
        "stage error has generated an error" in record.msg.lower()
        for record in caplog.records
        if record.levelno == logging.ERROR
    )
    assert pipeline.count == 10
    pipeline = (
        _pipeline()
        .set_error_manager(ErrorManager())
        .set_source(RandomTextSource(10))
        .append_stage("reverser", TextReverser())
        .append_stage("error1", CriticalIOErrorStage())
        .build()
    )
    for item in pipeline.run():
        assert item.has_critical_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error1")
        for error in item.critical_errors():
            assert isinstance(error.get_exception(), IOError)
            assert str(error) == "test pipeline critical IO error"
    pipeline = (
        _pipeline()
        .set_error_manager(ErrorManager())
        .set_source(RandomTextSource(10))
        .append_stage("reverser", TextReverser())
        .append_stage("error1", ExceptionStage())
        .append_stage("error2", ErrorStage())
        .build()
    )
    caplog.clear()
    for item in pipeline.run():
        assert item.has_critical_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error1")
        assert not item.get_timing("error2")
        for error in item.critical_errors():
            assert isinstance(error.get_exception(), Exception)
            assert (
                str(error) == "test exception"
                and str(error.get_exception()) == "test exception"
                and str(error) != "test pipeline error"
            )
    assert all(
        "stage error1 has generated an error" in record.msg.lower()
        for record in caplog.records
        if record.levelno == logging.ERROR
    )
    assert pipeline.count == 10
    with pytest.raises(Exception):
        pipeline = (
            _pipeline()
            .set_source(RandomTextSource(10))
            .append_stage("reverser", TextReverser())
            .append_stage("error", ExceptionStage())
            .build()
        )
        try:
            for _ in pipeline.run():
                pass
        except Exception:
            assert 'Exception("test exception")' in traceback.format_exc()
            raise
    assert pipeline.count == 1
    pipeline = (
        _pipeline()
        .set_error_manager(ErrorManager().no_skip_on_critical_error())
        .set_source(RandomTextSource(10))
        .append_stage("reverser1", TextReverser())
        .append_stage("error", ExceptionStage())
        .append_stage("reverser2", TextReverser())
        .build()
    )
    for item in pipeline.run():
        assert item.get_timing("reverser1")
        assert item.get_timing("error")
        assert item.get_timing("reverser2")
    assert pipeline.count == 10