Beispiel #1
0
 def pop(self) -> Optional[DataItem]:
     """
     Read the next line from the underlying file object and wrap it in a new item.

     :return: A new item whose payload ``_id`` is the stripped line, or ``None`` when the
         line is blank or when there are no more lines (in that case the source is stopped)
     """
     raw_line = next(self._file_obj, None)
     if raw_line is None:
         # nothing left to read: stop the source
         self.stop()
         return None
     identifier = raw_line.strip()
     if not identifier:
         # blank lines are never sent to the pipeline
         return None
     item = DataItem()
     item.payload["_id"] = identifier
     return item
Beispiel #2
0
def test_batch_concurrent_stage_container2():
    """
    Check that a :class:`BatchConcurrentStageContainer` fed directly by a source container
    returns processed items and can then be terminated and shut down.
    """
    manager = Manager()
    source = SourceContainer()
    items = [DataItem() for _ in range(10)]
    for item in items:
        item.payload["text"] = "something"
    source.set(ListSource(items))
    container = BatchConcurrentStageContainer(
        "test3",
        BatchTextGenerator(),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(source)
    container.run()
    # as many calls as the items put in the source
    # NOTE(review): presumably each call moves one item into the queue read by the container - confirm
    for _ in range(10):
        source.pop_into_queue()
    # NOTE(review): the pause presumably gives the concurrent stage time to process the items - confirm
    time.sleep(2)
    assert list(_get_items(container))
    container.terminate()
    # NOTE(review): the None item presumably wakes up the stage so that it notices the termination - confirm
    source.prepend_item(None)
    time.sleep(1)
    assert container.is_terminated()
    container.shutdown()
Beispiel #3
0
    def process_async(self,
                      item: DataItem,
                      callback: Optional[Callable[[DataItem], Any]] = None):
        """
        Submit a single item to the pipeline without waiting for its result, stages may run concurrently.
        The call returns immediately, processed items are retrieved with :meth:`.Pipeline.get_item`

        :param item: The item to send through the pipeline
        :param callback: A function to call after a successful process of the item
        """
        message = f"Processing asynchronously {item} on stages: {self._log_stages()}"
        _logger.debug(message)
        # the callback, when given, is attached to the item itself
        if callback is not None:
            item.set_callback(callback)
        # hand the item over to the source container, then start the executor
        self._source_container.prepend_item(item)
        self._start_pipeline_executor()
Beispiel #4
0
 def pop(self):
     """
     Produce the next generated item, stopping the source once more than ``total`` items are requested.

     :return: A new item whose payload holds the ``text`` returned by ``random_text()`` and the
         current ``count``, or ``None`` when the source has been stopped
     """
     self.counter += 1
     if self.counter <= self.total:
         item = DataItem()
         generated = {"text": random_text(), "count": self.counter}
         item.payload.update(generated)
         return item
     # the requested amount of items has already been produced
     self.stop()
     return None
Beispiel #5
0
def process(stage: Stage, item: DataItem,
            error_manager: ErrorManager) -> DataItem:
    """
    Execute the :meth:`.stage.Stage.process` method of a stage for an item

    :param stage: The stage whose ``process`` method is invoked
    :param item: The item to process
    :param error_manager: Checked before processing (the item is returned untouched when
        ``check_critical_errors`` is true) and used to handle any exception raised by the stage
    :return: The item returned by the stage, or the original item when the processing
        is not executed or fails
    """
    if error_manager.check_critical_errors(item):
        return item
    # durations are measured with a monotonic clock: with the wall clock, a system time
    # adjustment during the processing would distort the timing or even make it negative
    start = time.perf_counter()
    try:
        _logger.debug(f"{stage} is processing {item}")
        processed_item = stage.process(item)
        _logger.debug(f"{stage} has finished processing {processed_item}")
    except Exception as e:
        _logger.debug(f"{stage} has failed processing {item}")
        # register the timing before handling the error, so that the handling time is excluded
        item.set_timing(stage.name, time.perf_counter() - start)
        error_manager.handle(e, stage, item)
        return item
    # this can't be in a finally, otherwise it would register the `error_manager.handle` time
    processed_item.set_timing(stage.name, time.perf_counter() - start)
    return processed_item
Beispiel #6
0
def test_batch_concurrent_stage_container1():
    """
    Check a :class:`BatchConcurrentStageContainer` placed after a sequential batch stage:
    the retrieved items must carry a text, the container counter must match them and the
    container must be neither stopped nor terminated. A second container is then used to
    check that items put directly on its output queue are returned in the same order.
    """
    manager = Manager()
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(200)]))
    previous = BatchStageContainer("test0", BatchTextGenerator(),
                                   ErrorManager())
    previous.set_previous(source)
    container = BatchConcurrentStageContainer(
        "test2",
        BatchTextReverser(timeout=1.0),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(previous)
    container.run()
    # NOTE(review): presumably each call makes the previous stage produce input for the concurrent one - confirm
    for _ in range(10):
        previous.process()
    items5 = list(_get_items(container))
    assert items5 and all(items5)
    # the container counter must match the number of retrieved items
    assert container.count() == len(items5)
    for _ in range(11):
        previous.process()
    items6 = list(_get_items(container))
    assert items6 and all(items6)
    assert all(item.payload.get("text") for item in items5)
    assert all(item.payload.get("text") for item in items6)
    # the two rounds must not return the same items
    assert items5 != items6
    assert not container.is_stopped() and not container.is_terminated()
    container.empty_queues()
    container.terminate()

    # a new container with the same name, whose stage is created with timeout=0.0
    container = BatchConcurrentStageContainer(
        "test2",
        BatchTextReverser(timeout=0.0),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(previous)
    container.run()
    # items put directly on the output queue must be retrieved with the same payload and order
    queue = container.out_queue
    for item in items6:
        queue.put(item)
    result = list(_get_items(container))
    for i, item in enumerate(items6):
        assert item.payload == result[i].payload, "On item {}".format(i)
    container.terminate()
Beispiel #7
0
def test_batch_stage_container2():
    """
    Check that a batch stage container returns all the ten items of the source at the first
    call, without any stop, and reports the stop only at the following call.
    """
    source = SourceContainer()
    container = BatchStageContainer("test1", BatchTextReverser(),
                                    ErrorManager())
    items = []
    for _ in range(10):
        new_item = DataItem()
        new_item.payload["text"] = "something"
        items.append(new_item)
    source.set(ListSource(items))
    container.set_previous(source)

    # first call: the whole content of the source, no stop among the results
    first_batch = container.process()
    assert len(first_batch) == 10
    assert not any(isinstance(element, Stop) for element in first_batch)

    # second call: the source is exhausted, a stop is returned
    second_batch = container.process()
    assert any(isinstance(element, Stop) for element in second_batch)
    assert container.is_stopped()
    assert not container.is_terminated()
Beispiel #8
0
def test_batch_stage_container1():
    """
    Check two chained batch stage containers: the items returned by ``process`` must be equal
    to the ones retrieved afterwards, successive batches must differ, and items put on the
    output queue must be retrieved with the same payload and order.
    """
    manager = Manager()
    simple_item = DataItem()
    simple_item.payload["text"] = "hello world"
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(200)]))
    generator_stage = BatchStageContainer("test0", BatchTextGenerator(),
                                          ErrorManager())
    generator_stage.set_previous(source)
    container = BatchStageContainer("test1", BatchTextReverser(),
                                    ErrorManager())
    container.set_previous(generator_stage)

    # first round
    generator_stage.process()
    first_processed = container.process()
    assert len(first_processed) == container.count()
    first_retrieved = list(_get_items(container))
    assert all(first_processed)
    assert all(first_retrieved)
    assert all(element.payload.get("text") for element in first_processed)
    assert all(element.payload.get("text") for element in first_retrieved)
    assert first_processed == first_retrieved

    # second round
    generator_stage.process()
    second_processed = container.process()
    second_retrieved = list(_get_items(container))
    assert all(second_processed)
    assert all(second_retrieved)
    assert all(element.payload.get("text") for element in second_processed)
    assert all(element.payload.get("text") for element in second_retrieved)
    assert first_processed != second_processed
    assert second_processed == second_retrieved
    assert not container.is_stopped()
    assert not container.is_terminated()

    # items put directly on the output queue
    container.init_queue(manager.Queue)
    out_queue = container.out_queue
    for element in second_retrieved:
        out_queue.put(element)
    queued = list(_get_items(container))
    for index, expected in enumerate(second_retrieved):
        assert expected.payload == queued[index].payload
Beispiel #9
0
def test_source_container():
    """
    Check the source container: set/stopped flags, consumption of the source until the stop,
    precedence of prepended items over the source ones and retrieval through the output queue.
    """
    manager = Manager()

    def make_item(identifier):
        # build a new item carrying the given identifier in its metadata
        new_item = DataItem()
        new_item.set_metadata("id", identifier)
        return new_item

    data = [DataItem() for _ in range(100)]
    for position, element in enumerate(data, start=1):
        element.set_metadata("id", position)

    # consume the whole source: the container is stopped only once the stop is returned
    container = SourceContainer()
    assert not container.is_set()
    container.set(ListSource(data))
    assert not container.is_stopped()
    assert container.is_set()
    current = container.get_processed()
    assert current.get_metadata("id") == 1
    while not isinstance(current, Stop):
        assert not container.is_stopped()
        current = container.get_processed()
    assert container.is_stopped()

    # prepended items are returned before the source ones, in insertion order
    container = SourceContainer()
    container.set(ListSource(data))
    container.prepend_item(make_item(1001))
    container.prepend_item(make_item(1002))
    for expected_id in (1001, 1002, 1, 2):
        assert container.get_processed().get_metadata("id") == expected_id
    container.prepend_item(make_item(1003))
    for expected_id in (1003, 3):
        assert container.get_processed().get_metadata("id") == expected_id
    assert not container.is_stopped()

    # once the queue is initialized, items are retrieved through it
    container.init_queue(manager.Queue)
    out_queue = container.out_queue
    out_queue.put(make_item(1004))
    assert container.get_processed().get_metadata("id") == 1004
    container.pop_into_queue()
    assert container.get_processed().get_metadata("id") == 4
Beispiel #10
0
def test_error():
    """
    Check that soft and critical errors are registered separately on an item and that
    an error instance of the wrong kind is rejected with a ``ValueError``.
    """
    item = DataItem()
    stage = TextReverser()
    for soft_exception in (ValueError("value error"), KeyError("key error")):
        item.add_soft_error(stage.name, soft_exception)
    item.add_critical_error(stage.name, KeyError("key error"))
    assert item.has_critical_errors()
    assert item.has_errors()
    assert sum(1 for _ in item.soft_errors()) == 2
    assert sum(1 for _ in item.critical_errors()) == 1

    stage = TextReverser()
    # error instances of the matching kind are accepted
    item.add_soft_error(stage.name, SoftError())
    item.add_critical_error(stage.name, CriticalError())
    # error instances of the other kind are refused
    with pytest.raises(ValueError):
        item.add_soft_error(stage.name, CriticalError())
    with pytest.raises(ValueError):
        item.add_critical_error(stage.name, SoftError())
Beispiel #11
0
 def process(self, item: DataItem):
     """
     Store the name of the stage file in the payload, only when it is the file of this module.

     :param item: The item to process
     :return: The same item, with the payload ``file`` set when the condition holds
     """
     handle = self._file
     if handle is None or handle.name != __file__:
         # no file, or a file different from this module: leave the item as it is
         return item
     item.payload["file"] = handle.name
     return item
Beispiel #12
0
def test_data():
    """
    Check the identifier, payload and metadata accessors of a :class:`DataItem`.
    """
    item = DataItem()
    # a new item already has a truthy identifier
    assert item.id
    for key, value in (("text", "prova"), ("id", "666")):
        item.payload[key] = value
    expected_metadata = (("source", "remote"), ("version", 3))
    for field, value in expected_metadata:
        item.set_metadata(field, value)
    # the identifier is now the one stored in the payload
    assert item.id == "666"
    for field, value in expected_metadata:
        assert item.get_metadata(field) == value
    # a metadata field never set is falsy
    assert not item.get_metadata("head")
Beispiel #13
0
def test_stage_container():
    """
    Check :class:`StageContainer` and :class:`ConcurrentStageContainer`: processed items,
    counters, stopped/terminated flags, retrieval through the output queue and the stop
    returned once the source has no more items.
    """
    manager = Manager()
    simple_item = DataItem()
    simple_item.payload["text"] = "hello world"
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(20)]))
    previous = StageContainer("test0", TextGenerator(), ErrorManager())
    previous.set_previous(source)
    container = StageContainer("test1", TextReverser(), ErrorManager())
    container.set_previous(previous)
    # sequential container: the item returned by process must equal the one retrieved afterwards
    previous.process()
    assert container.count() == 0
    item1 = container.process()
    item2 = container.get_processed()
    assert item1 and item2
    assert item1 == item2
    assert container.count() == 1
    previous.process()
    item3 = container.process()
    item4 = container.get_processed()
    assert container.count() == 2
    assert item3 and item4
    assert item1 != item3
    assert item3 == item4
    assert not container.is_stopped() and not container.is_terminated()
    # an item put directly on the output queue must be retrieved with the same payload
    container.init_queue(manager.Queue)
    queue = container.out_queue
    queue.put(item4)
    assert item4.payload == container.get_processed().payload
    # a source with a single item: the second process call must return the stop
    source = SourceContainer()
    source.set(ListSource([simple_item]))
    container.set_previous(source)
    assert container.process()
    assert isinstance(container.process(), Stop)
    assert container.is_stopped() and not container.is_terminated()

    # concurrent container placed after the sequential `previous` container
    container = ConcurrentStageContainer(
        "test2",
        TextReverser(),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(previous)
    container.run()
    previous.process()
    item5 = container.get_processed(block=True)
    assert item5
    previous.process()
    item6 = container.get_processed(block=True)
    assert item6
    assert item5 != item6
    assert not container.is_stopped() and not container.is_terminated()
    queue = container.out_queue
    queue.put(item6)
    assert item6.payload == container.get_processed().payload
    container.terminate()
    container.shutdown()

    # concurrent container fed directly by a source with a single item
    source = SourceContainer()
    source.set(ListSource([simple_item]))
    container = ConcurrentStageContainer(
        "test2",
        TextReverser(),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(source)
    container.run()
    source.pop_into_queue()
    assert container.get_processed(block=True)
    # the source has no more items: the stop must come out of the container
    source.pop_into_queue()
    assert isinstance(container.get_processed(block=True), Stop)
    container.terminate()
    # NOTE(review): the None item presumably wakes up the stage so that it notices the termination - confirm
    source.prepend_item(None)
    time.sleep(1)
    assert container.is_terminated()
    container.shutdown()
Beispiel #14
0
 def _generator():
     """
     Endlessly yield new items whose payload holds a progressive ``id``, starting from 0,
     and the ``text`` returned by ``random_text()``.
     """
     index = 0
     while True:
         item = DataItem()
         item.payload['id'] = index
         item.payload['text'] = random_text()
         yield item
         index += 1
Beispiel #15
0
def test_critical_errors(caplog):
    """
    Check how the error manager handles critical errors: with the default configuration,
    when raising on critical errors and when not skipping on critical errors.
    """
    stage = TextReverser()

    # default manager: the critical error is registered on the item and logged
    manager = ErrorManager()
    item = DataItem()
    error = CriticalError()
    error.with_exception(Exception())
    managed_critical_error = manager.handle(error, stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    registered_exception = next(item.critical_errors()).get_exception()
    expected_type = type(managed_critical_error.get_exception())
    assert isinstance(registered_exception, expected_type)
    assert any(caplog.records)

    # raising manager: the error is raised, but still registered on the item
    manager = ErrorManager().raise_on_critical_error()
    item = DataItem()
    with pytest.raises(CriticalError):
        manager.handle(CriticalError(), stage, item)
    with pytest.raises(Exception):
        error = CriticalError().with_exception(Exception())
        manager.handle(error, stage, item)
    assert any(caplog.records)
    assert item.has_critical_errors()
    assert not item.has_errors()

    # no-skip manager: handling a critical error returns nothing
    manager = ErrorManager().no_skip_on_critical_error()
    item = DataItem()
    assert manager.handle(CriticalError(), stage, item) is None
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert any(caplog.records)

    # generic exceptions are registered as critical errors
    item = DataItem()
    for exception_type in (ValueError, KeyError, KeyError):
        manager.handle(exception_type(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert sum(1 for _ in item.critical_errors()) == 3
    assert all("has generated an error" in record.message
               for record in caplog.records)
Beispiel #16
0
def test_manager(caplog):
    """
    Check that the error manager registers soft errors, critical errors and generic
    exceptions on the item, logging each of them.
    """
    manager = ErrorManager()
    stage = TextReverser()

    # a soft error is registered as a plain error
    soft_item = DataItem()
    manager.handle(SoftError(), stage, soft_item)
    assert any(caplog.records)
    assert soft_item.has_errors()
    assert not soft_item.has_critical_errors()

    # a critical error is registered only as critical
    critical_item = DataItem()
    manager.handle(CriticalError(), stage, critical_item)
    assert not critical_item.has_errors()
    assert critical_item.has_critical_errors()
    assert any(caplog.records)

    # generic exceptions are registered as critical errors
    generic_item = DataItem()
    for exception_type in (ValueError, KeyError, KeyError):
        manager.handle(exception_type(), stage, generic_item)
    assert not generic_item.has_errors()
    assert generic_item.has_critical_errors()
    assert sum(1 for _ in generic_item.critical_errors()) == 3
    assert all("has generated an error" in record.message
               for record in caplog.records)
Beispiel #17
0
 def process(self, item: DataItem):
     """
     Reverse the payload ``text`` of the item, once for each of the ``_cycles`` of the stage.

     :param item: The item to process
     :return: The same item, with the resulting text in the payload
     """
     for _ in range(self._cycles):
         reversed_text = item.payload["text"][::-1]
         item.payload["text"] = reversed_text
     return item
Beispiel #18
0
 def process(self, item: DataItem):
     """
     Overwrite the payload ``text`` of the item with the value returned by ``random_text()``.

     :param item: The item to process
     :return: The same item, with the new text in the payload
     """
     generated_text = random_text()
     item.payload["text"] = generated_text
     return item