def pop(self) -> Optional[DataItem]:
    """
    Read the next line from the underlying file object and wrap it in an item.

    :return: An item whose ``_id`` payload field holds the stripped line;
        ``None`` when the line is blank, or when the file is exhausted
        (in which case the source is also stopped)
    """
    raw_line = next(self._file_obj, None)
    if raw_line is None:
        # nothing left to read from the file
        self.stop()
        return None
    stripped = raw_line.strip()
    if not stripped:
        # send only non-empty lines of a file to the pipeline
        return None
    item = DataItem()
    item.payload["_id"] = stripped
    return item
def test_batch_concurrent_stage_container2():
    """
    Feed ten items through a concurrent batch container fed directly by a source,
    then check that it produces output and terminates.
    """
    mp_manager = Manager()
    source = SourceContainer()
    items = []
    for _ in range(10):
        item = DataItem()
        item.payload["text"] = "something"
        items.append(item)
    source.set(ListSource(items))

    container = BatchConcurrentStageContainer(
        "test3",
        BatchTextGenerator(),
        ErrorManager(),
        mp_manager.Queue,
        lambda: ProcessCounter(mp_manager),
        mp_manager.Event,
    )
    container.set_previous(source)
    container.run()

    for _ in range(10):
        source.pop_into_queue()
    # give the concurrent stage some time to consume the queued items
    time.sleep(2)
    assert list(_get_items(container))

    container.terminate()
    source.prepend_item(None)
    time.sleep(1)
    assert container.is_terminated()
    container.shutdown()
def process_async(
    self,
    item: DataItem,
    callback: Optional[Callable[[DataItem], Any]] = None,
):
    """
    Submit a single item to the pipeline for asynchronous processing; stages may run concurrently.
    The call returns immediately, processed items are retrieved with :meth:`.Pipeline.get_item`

    :param item: The item to push through the pipeline
    :param callback: A function to call after a successful process of the item
    """
    _logger.debug(
        f"Processing asynchronously {item} on stages: {self._log_stages()}"
    )
    callback_given = callback is not None
    if callback_given:
        item.set_callback(callback)
    # hand the item to the source container, then make sure the executor is started
    source_container = self._source_container
    source_container.prepend_item(item)
    self._start_pipeline_executor()
def pop(self):
    """
    Produce the next generated item, stopping the source once ``total`` items were emitted.

    :return: An item carrying a random ``text`` and its ordinal ``count``,
        or ``None`` after the source has been stopped
    """
    self.counter += 1
    if self.counter <= self.total:
        generated = DataItem()
        generated.payload.update(text=random_text(), count=self.counter)
        return generated
    # the requested amount of items has been exceeded
    self.stop()
    return None
def process(stage: Stage, item: DataItem, error_manager: ErrorManager) -> DataItem:
    """
    Run the :meth:`.stage.Stage.process` method of a stage on an item, recording the elapsed time.

    :param stage: The stage that must process the item
    :param item: The item to process
    :param error_manager: Manager used to check for critical errors and to handle failures
    :return: The processed item, or the original item if it was skipped or the stage failed
    """
    if error_manager.check_critical_errors(item):
        return item
    started_at = time.time()
    try:
        _logger.debug(f"{stage} is processing {item}")
        result = stage.process(item)
        _logger.debug(f"{stage} has finished processing {result}")
    except Exception as exc:
        _logger.debug(f"{stage} has failed processing {item}")
        # register the timing before handling, so the handling time is not counted
        item.set_timing(stage.name, time.time() - started_at)
        error_manager.handle(exc, stage, item)
        return item
    else:
        # this can't be in a finally, otherwise it would register the `error_manager.handle` time
        result.set_timing(stage.name, time.time() - started_at)
        return result
def test_batch_concurrent_stage_container1():
    """
    Exercise a concurrent batch container placed after a non-concurrent batch stage,
    then check that items put directly on the output queue are returned unchanged.
    """
    mp_manager = Manager()

    def _concurrent_reverser(timeout):
        # both scenarios use the same configuration, only the stage timeout differs
        return BatchConcurrentStageContainer(
            "test2",
            BatchTextReverser(timeout=timeout),
            ErrorManager(),
            mp_manager.Queue,
            lambda: ProcessCounter(mp_manager),
            mp_manager.Event,
        )

    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(200)]))
    previous = BatchStageContainer("test0", BatchTextGenerator(), ErrorManager())
    previous.set_previous(source)

    container = _concurrent_reverser(1.0)
    container.set_previous(previous)
    container.run()

    for _ in range(10):
        previous.process()
    first_round = list(_get_items(container))
    assert first_round
    assert all(first_round)
    assert container.count() == len(first_round)

    for _ in range(11):
        previous.process()
    second_round = list(_get_items(container))
    assert second_round
    assert all(second_round)
    assert all(item.payload.get("text") for item in first_round)
    assert all(item.payload.get("text") for item in second_round)
    assert first_round != second_round
    assert not (container.is_stopped() or container.is_terminated())
    container.empty_queues()
    container.terminate()

    container = _concurrent_reverser(0.0)
    container.set_previous(previous)
    container.run()
    out_queue = container.out_queue
    for item in second_round:
        out_queue.put(item)
    retrieved = list(_get_items(container))
    for position, expected in enumerate(second_round):
        assert expected.payload == retrieved[position].payload, "On item {}".format(position)
    container.terminate()
def test_batch_stage_container2():
    """
    Check that a batch container returns all ten source items in its first batch
    and reports a stop on the following one.
    """
    source = SourceContainer()
    container = BatchStageContainer("test1", BatchTextReverser(), ErrorManager())
    items = []
    for _ in range(10):
        item = DataItem()
        item.payload["text"] = "something"
        items.append(item)
    source.set(ListSource(items))
    container.set_previous(source)

    first_batch = container.process()
    assert len(first_batch) == 10
    assert all(not isinstance(element, Stop) for element in first_batch)

    second_batch = container.process()
    assert any(isinstance(element, Stop) for element in second_batch)
    assert container.is_stopped()
    assert not container.is_terminated()
def test_batch_stage_container1():
    """
    Exercise a non-concurrent batch container chained after another batch stage.

    Two batches are processed and compared with what the container exposes as
    processed items; then an output queue is initialized and items put on it
    directly are expected to come back with the same payload.
    """
    manager = Manager()
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(200)]))
    previous = BatchStageContainer("test0", BatchTextGenerator(), ErrorManager())
    previous.set_previous(source)
    container = BatchStageContainer("test1", BatchTextReverser(), ErrorManager())
    container.set_previous(previous)

    # first batch
    previous.process()
    items1 = container.process()
    assert len(items1) == container.count()
    items2 = list(_get_items(container))
    assert all(items1) and all(items2)
    assert all(item.payload.get("text") for item in items1)
    assert all(item.payload.get("text") for item in items2)
    assert items1 == items2

    # second batch must differ from the first one
    previous.process()
    items3 = container.process()
    items4 = list(_get_items(container))
    assert all(items3) and all(items4)
    assert all(item.payload.get("text") for item in items3)
    assert all(item.payload.get("text") for item in items4)
    assert items1 != items3
    assert items3 == items4
    assert not container.is_stopped() and not container.is_terminated()

    # items put directly on the output queue are retrieved as they are
    container.init_queue(manager.Queue)
    queue = container.out_queue
    for item in items4:
        queue.put(item)
    result = list(_get_items(container))
    for i, item in enumerate(items4):
        # report the failing position, as the concurrent variant of this test does
        assert item.payload == result[i].payload, f"On item {i}"
def test_source_container():
    """
    Check the source container: set/stopped state, ordering of prepended items
    with respect to source items, and retrieval through an output queue.
    """
    mp_manager = Manager()

    def _item_with_id(identifier):
        # build an item tagged with the given "id" metadata
        tagged = DataItem()
        tagged.set_metadata("id", identifier)
        return tagged

    data = [DataItem() for _ in range(100)]
    for identifier, element in enumerate(data, start=1):
        element.set_metadata("id", identifier)

    container = SourceContainer()
    assert not container.is_set()
    container.set(ListSource(data))
    assert not container.is_stopped()
    assert container.is_set()

    # consume the whole source, it must be stopped only at the end
    current = container.get_processed()
    assert current.get_metadata("id") == 1
    while not isinstance(current, Stop):
        assert not container.is_stopped()
        current = container.get_processed()
    assert container.is_stopped()

    # prepended items come out before the items of the source
    container = SourceContainer()
    container.set(ListSource(data))
    container.prepend_item(_item_with_id(1001))
    container.prepend_item(_item_with_id(1002))
    for expected in (1001, 1002, 1, 2):
        assert container.get_processed().get_metadata("id") == expected
    container.prepend_item(_item_with_id(1003))
    for expected in (1003, 3):
        assert container.get_processed().get_metadata("id") == expected
    assert not container.is_stopped()

    # with an output queue, items are retrieved from the queue
    container.init_queue(mp_manager.Queue)
    out_queue = container.out_queue
    out_queue.put(_item_with_id(1004))
    assert container.get_processed().get_metadata("id") == 1004
    container.pop_into_queue()
    assert container.get_processed().get_metadata("id") == 4
def test_error():
    """
    Check that soft and critical errors are tracked separately on an item
    and that adding an error of the wrong kind raises :class:`ValueError`.
    """
    item = DataItem()
    stage = TextReverser()
    for soft in (ValueError("value error"), KeyError("key error")):
        item.add_soft_error(stage.name, soft)
    item.add_critical_error(stage.name, KeyError("key error"))

    assert item.has_critical_errors()
    assert item.has_errors()
    assert sum(1 for _ in item.soft_errors()) == 2
    assert sum(1 for _ in item.critical_errors()) == 1

    stage = TextReverser()
    item.add_soft_error(stage.name, SoftError())
    item.add_critical_error(stage.name, CriticalError())
    # error instances of the wrong kind must be rejected
    with pytest.raises(ValueError):
        item.add_soft_error(stage.name, CriticalError())
    with pytest.raises(ValueError):
        item.add_critical_error(stage.name, SoftError())
def process(self, item: DataItem):
    """
    Store the name of the stage's file in the item payload, when that file is this very module.

    :param item: The item to process
    :return: The same item, with the ``file`` payload field set only if the
        stage holds a file whose name equals ``__file__``
    """
    if self._file is None:
        return item
    if self._file.name == __file__:
        item.payload["file"] = self._file.name
    return item
def test_data():
    """
    Check the item id (before and after an ``id`` payload field is set)
    and the metadata getter/setter.
    """
    item = DataItem()
    assert item.id

    item.payload["text"] = "prova"
    item.payload["id"] = "666"
    metadata = {"source": "remote", "version": 3}
    for key, value in metadata.items():
        item.set_metadata(key, value)

    assert item.id == "666"
    for key, value in metadata.items():
        assert item.get_metadata(key) == value
    # a metadata field never set gives a falsy value
    assert not item.get_metadata("head")
def test_stage_container():
    """
    Exercise the single-item stage containers, first the non-concurrent
    and then the concurrent one: processing, stop and termination.
    """
    mp_manager = Manager()

    def _concurrent_reverser():
        # both concurrent scenarios use an identically configured container
        return ConcurrentStageContainer(
            "test2",
            TextReverser(),
            ErrorManager(),
            mp_manager.Queue,
            lambda: ProcessCounter(mp_manager),
            mp_manager.Event,
        )

    simple_item = DataItem()
    simple_item.payload["text"] = "hello world"
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(20)]))
    previous = StageContainer("test0", TextGenerator(), ErrorManager())
    previous.set_previous(source)
    container = StageContainer("test1", TextReverser(), ErrorManager())
    container.set_previous(previous)

    # non-concurrent container: the processed item is also what get_processed returns
    previous.process()
    assert container.count() == 0
    first = container.process()
    first_again = container.get_processed()
    assert first
    assert first_again
    assert first == first_again
    assert container.count() == 1
    previous.process()
    second = container.process()
    second_again = container.get_processed()
    assert container.count() == 2
    assert second
    assert second_again
    assert first != second
    assert second == second_again
    assert not (container.is_stopped() or container.is_terminated())

    # with an output queue, items are retrieved from the queue
    container.init_queue(mp_manager.Queue)
    out_queue = container.out_queue
    out_queue.put(second_again)
    assert second_again.payload == container.get_processed().payload

    # a source with a single item gives a stop on the second process
    source = SourceContainer()
    source.set(ListSource([simple_item]))
    container.set_previous(source)
    assert container.process()
    assert isinstance(container.process(), Stop)
    assert container.is_stopped()
    assert not container.is_terminated()

    # concurrent container fed by the previous stage
    container = _concurrent_reverser()
    container.set_previous(previous)
    container.run()
    previous.process()
    third = container.get_processed(block=True)
    assert third
    previous.process()
    fourth = container.get_processed(block=True)
    assert fourth
    assert third != fourth
    assert not (container.is_stopped() or container.is_terminated())
    out_queue = container.out_queue
    out_queue.put(fourth)
    assert fourth.payload == container.get_processed().payload
    container.terminate()
    container.shutdown()

    # concurrent container fed directly by a single-item source
    source = SourceContainer()
    source.set(ListSource([simple_item]))
    container = _concurrent_reverser()
    container.set_previous(source)
    container.run()
    source.pop_into_queue()
    assert container.get_processed(block=True)
    source.pop_into_queue()
    assert isinstance(container.get_processed(block=True), Stop)
    container.terminate()
    source.prepend_item(None)
    time.sleep(1)
    assert container.is_terminated()
    container.shutdown()
def _generator():
    """
    Endlessly yield new items, each with an incremental ``id`` (starting at 0)
    and a random ``text`` in its payload.
    """

    def _build(index):
        # one fresh item per counter value
        item = DataItem()
        item.payload['id'] = index
        item.payload['text'] = random_text()
        return item

    for index in itertools.count():
        yield _build(index)
def test_critical_errors(caplog):
    """
    Check how the error manager handles critical errors in its default
    configuration, when raising on critical errors and when not skipping on them.
    """
    stage = TextReverser()

    # default manager: the critical error is registered on the item and logged
    manager = ErrorManager()
    item = DataItem()
    critical = CriticalError()
    critical.with_exception(Exception())
    managed = manager.handle(critical, stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    registered_exception = next(item.critical_errors()).get_exception()
    expected_type = type(managed.get_exception())
    assert isinstance(registered_exception, expected_type)
    assert any(caplog.records)

    # manager raising on critical errors
    manager = ErrorManager().raise_on_critical_error()
    item = DataItem()
    with pytest.raises(CriticalError):
        manager.handle(CriticalError(), stage, item)
    with pytest.raises(Exception):
        critical = CriticalError().with_exception(Exception())
        manager.handle(critical, stage, item)
    assert any(caplog.records)
    assert item.has_critical_errors()
    assert not item.has_errors()

    # manager not skipping on critical errors
    manager = ErrorManager().no_skip_on_critical_error()
    item = DataItem()
    assert manager.handle(CriticalError(), stage, item) is None
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert any(caplog.records)

    # generic exceptions are registered as critical errors
    item = DataItem()
    for exception_type in (ValueError, KeyError, KeyError):
        manager.handle(exception_type(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert sum(1 for _ in item.critical_errors()) == 3
    assert all(
        "has generated an error" in record.message for record in caplog.records
    )
def test_manager(caplog):
    """
    Check that the error manager registers soft errors, critical errors and
    generic exceptions on the item, logging each of them.
    """
    manager = ErrorManager()
    stage = TextReverser()

    # a soft error is registered as a (non critical) error
    item = DataItem()
    manager.handle(SoftError(), stage, item)
    assert any(caplog.records)
    assert item.has_errors()
    assert not item.has_critical_errors()

    # a critical error is registered only as critical
    item = DataItem()
    manager.handle(CriticalError(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert any(caplog.records)

    # generic exceptions are registered as critical errors
    item = DataItem()
    for exception_type in (ValueError, KeyError, KeyError):
        manager.handle(exception_type(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert sum(1 for _ in item.critical_errors()) == 3
    assert all(
        "has generated an error" in record.message for record in caplog.records
    )
def process(self, item: DataItem):
    """
    Reverse the ``text`` payload field of the item, once for each configured cycle.

    :param item: The item to process, it must have a ``text`` payload field
        when at least one cycle is configured
    :return: The same item, with the text reversed ``self._cycles`` times
    """
    for _cycle in range(self._cycles):
        current_text = item.payload["text"]
        item.payload["text"] = current_text[::-1]
    return item
def process(self, item: DataItem):
    """
    Overwrite the ``text`` payload field of the item with a newly generated random text.

    :param item: The item to process
    :return: The same item
    """
    generated_text = random_text()
    item.payload["text"] = generated_text
    return item