Example #1
    def dereference_pack(self, pack_id: int):
        """
        This method reduce the count the data pack or multi pack, when the count
        reaches 0, the pack will be released.

        Remember to de-reference a pack after processing; otherwise the
        pack stays in the pool and causes memory issues.

        Args:
            pack_id: The pack id that points to the pack to be de-referenced.
        """
        if pack_id not in self.pack_references:
            # This can happen when the instance is reset by the pipeline.
            return

        if self.pack_references[pack_id] < 0:
            # It is unclear whether the reference count can ever be
            # decremented too many times, but we put a check here just in case.
            raise ProcessFlowException(
                f"Pack reference count for pack [{pack_id}] is only "
                f"{self.pack_references[pack_id]},"
                f" which is invalid.")

        # Reduce the reference count.
        self.pack_references[pack_id] -= 1

        # If the reference count reaches 0, then we can remove the pack from
        # the pool and allow Python to garbage collect it.
        if self.pack_references[pack_id] == 0:
            self.pack_pool.pop(pack_id)
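
The reference-counting pattern above can be shown in isolation. A minimal, self-contained sketch; `PackPool` and its fields are hypothetical stand-ins for the real manager, not Forte API:

    class PackPool:
        def __init__(self):
            self.pack_references = {}  # pack_id -> reference count
            self.pack_pool = {}        # pack_id -> pack object

        def reference_pack(self, pack_id, pack):
            # Store the pack and bump its reference count.
            self.pack_pool[pack_id] = pack
            self.pack_references[pack_id] = \
                self.pack_references.get(pack_id, 0) + 1

        def dereference_pack(self, pack_id):
            if pack_id not in self.pack_references:
                return  # Already reset, nothing to do.
            if self.pack_references[pack_id] <= 0:
                raise RuntimeError(
                    f"Invalid reference count for pack [{pack_id}].")
            self.pack_references[pack_id] -= 1
            if self.pack_references[pack_id] == 0:
                # Drop the last strong reference so Python can collect it.
                self.pack_pool.pop(pack_id)

    pool = PackPool()
    pool.reference_pack(1, object())
    pool.dereference_pack(1)
    assert 1 not in pool.pack_pool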
Example #2
    def process_one(self, *args, **kwargs) -> PackType:
        r"""Process one single data pack. This is done by only reading and
        processing the first pack in the reader.

        Args:
            args, kwargs: The information needed to load the data. For
                example, if :attr:`_reader` is :class:`StringReader`, this
                should contain a single piece of text in the form of a string
                variable. If :attr:`_reader` is a file reader, this can point
                to the file path.
        """
        if not self.initialized:
            raise ProcessFlowException(
                "Please call initialize before running the pipeline")

        first_pack = []

        for p in self._reader.iter(*args, **kwargs):
            first_pack.append(p)
            break

        if len(first_pack) == 1:
            results = list(self._process_packs(iter(first_pack)))
            return results[0]
        else:
            raise ValueError("Input data source contains no packs.")
Example #3
    def process_dataset(self, *args, **kwargs) -> Iterator[PackType]:
        r"""Process the documents in the data source(s) and return an
        iterator or list of DataPacks. The arguments are directly passed
        to the reader to take data from the source.
        """
        if not self.initialized:
            raise ProcessFlowException(
                "Please call initialize before running the pipeline")

        data_iter = self._reader.iter(*args, **kwargs)
        return self._process_packs(data_iter)
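
Continuing the assumed setup from the previous sketch, process_dataset() forwards its arguments verbatim to the reader's iter() and yields processed packs lazily:

    # `data_source` is a placeholder for whatever the configured reader
    # expects; for a StringReader this is a piece of text.
    data_source = "Some text."
    for pack in pipeline.process_dataset(data_source):
        print(pack)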
Example #4
    def add(self, component: PipelineComponent,
            config: Optional[Union[Config, Dict[str, Any]]] = None,
            selector: Optional[Selector] = None):
        if isinstance(component, BaseReader):
            raise ProcessFlowException(
                "Readers need to be set via set_reader().")

        # Record the position of this component before appending it.
        self._processors_index[component.name] = len(self.components)

        if isinstance(component, Evaluator):
            # This will ask the job to keep a copy of the gold standard.
            self.evaluator_indices.append(len(self.components))

        component.assign_manager(self._proc_mgr, self._pack_manager)
        self._components.append(component)
        self.processor_configs.append(component.make_configs(config))

        if selector is None:
            self._selectors.append(DummySelector())
        else:
            self._selectors.append(selector)
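
When no selector is given, a DummySelector is appended; its documented behavior is to select the whole pack. A pass-through stand-in would look roughly like this (illustrative, not the real class):

    class PassThroughSelector:
        def select(self, pack):
            # Yield the input pack unchanged, so the component
            # processes the whole pack.
            yield pack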
Example #5
    def set_remapped_pack_id(self, pack: ContainerType):
        """
        Give a new id to the pack and remember the remap.

        Args:
            pack:

        Returns:

        """
        # The pack should already have a valid pack id.
        assert get_pack_id(pack) >= 0

        pid = get_pack_id(pack)

        # Record this remapping, and assign a new id to the pack.
        if pid in self.remap:
            raise ProcessFlowException(f"The pack id {pid} "
                                       f"has already been remapped.")

        self.remap[pid] = self.next_id
        pack.meta.pack_id = self.next_id  # type: ignore
        self.next_id += 1
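
The remapping bookkeeping can be shown in isolation. A minimal sketch; `IdRemapper` is a hypothetical stand-in for the real manager:

    class IdRemapper:
        def __init__(self):
            self.remap = {}  # old pack id -> new pack id
            self.next_id = 0

        def remap_id(self, old_id):
            if old_id in self.remap:
                raise RuntimeError(
                    f"The pack id {old_id} has already been remapped.")
            # Record the remapping and hand out the next fresh id.
            self.remap[old_id] = self.next_id
            self.next_id += 1
            return self.remap[old_id]

    r = IdRemapper()
    assert r.remap_id(1234) == 0
    assert r.remap_id(5678) == 1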
Example #6
    def _process_packs(self,
                       data_iter: Iterator[PackType]) -> Iterator[PackType]:
        r"""Process the packs received from the reader by the running through
        the pipeline.

        Args:
            data_iter (iterator): An iterator of the packs read from the
                reader.

        Returns:
            Yields packs that are processed by the pipeline.
        """

        # pylint: disable=line-too-long

        # Here is the logic for the execution of the pipeline.

        # The basic idea is to yield a pack as soon as it gets processed by all
        # the processors instead of waiting for later jobs to get processed.

        # 1) A job can be in one of three statuses:
        #  - UNPROCESSED
        #  - QUEUED
        #  - PROCESSED
        #
        # 2) Each processor maintains a queue to hold the jobs to be executed
        # next.
        #
        # 3) In case of a BatchProcessor, a job enters the QUEUED status if
        # the job does not satisfy the `batch_size` requirement of that
        # processor. In that case, the pipeline requests additional jobs from
        # the reader and restarts the execution loop from the beginning.
        #
        # 4) At any point, while moving to the next processor, the pipeline
        # ensures that all jobs are either in QUEUED or PROCESSED status. If
        # they are PROCESSED, they will be moved to the next queue. This design
        # ensures that at any point, while processing the job at processor `i`,
        # all the jobs in the previous queues are in QUEUED status. So whenever
        # a new job is needed, the pipeline can directly request it from the
        # reader instead of looking at previous queues for UNPROCESSED jobs.
        #
        # 5) When a processor receives a poison pack, it flushes all the
        # remaining batches in its memory (this actually has no effect on
        # PackProcessors) and moves the jobs, including the poison pack, to
        # the next queue. If there is no next processor, the packs are
        # yielded.
        #
        # 6) The loop terminates when the last queue contains only a poison
        # pack.
        #
        # Here is the sample pipeline and its execution
        #
        # Assume 1 pack corresponds to a batch of size 1
        #
        # After 1st step (iteration), reading from the reader,
        #
        #            batch_size = 2                               batch_size = 2
        #  Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|
        #          |___________|
        #          |___________|
        #          |___________|
        #          |_J1:QUEUED_|
        #
        # B1 needs another pack to process job J1
        #
        # After 2nd step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|       |_______________|
        #          |___________|       |_______________|
        #          |___________|       |_______________|
        #          |___________|       |_J2:UNPROCESSED_|
        #          |___________|       |_J1:UNPROCESSED_|
        #
        # B1 processes both the packs, the jobs are moved to the next queue.
        #
        # After 3rd step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_J2:UNPROCESSED_|     |_J1:UNPROCESSED_|
        #
        # P1 processes the first job. However, there exists one UNPROCESSED
        # job, J2, in the queue. The pipeline first processes this job before
        # moving to the next processor.
        #
        # After 4th step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_J2:UNPROCESSED_|
        #        |___________|       |_______________|     |_J1:UNPROCESSED_|
        #
        #
        # After 5th step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|    --> Yield J1.pack and J2.pack
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|

        if not self.initialized:
            raise ProcessFlowException(
                "Please call initialize before running the pipeline")

        buffer = ProcessBuffer(self, data_iter)

        if len(self.components) == 0:
            yield from data_iter
            # Write return here instead of using if..else to reduce indent.
            return

        while not self.proc_mgr.exhausted():
            # The job is the first UNPROCESSED element; its status at this
            # point is UNPROCESSED.
            unprocessed_job: ProcessJob = next(buffer)

            processor_index = self.proc_mgr.current_processor_index
            processor = self.components[processor_index]
            selector = self._selectors[processor_index]
            current_queue_index = self.proc_mgr.current_queue_index
            current_queue = self.proc_mgr.current_queue
            pipeline_length = self.proc_mgr.pipeline_length
            unprocessed_queue_indices = \
                self.proc_mgr.unprocessed_queue_indices
            processed_queue_indices = \
                self.proc_mgr.processed_queue_indices
            next_queue_index = current_queue_index + 1
            should_yield = next_queue_index >= pipeline_length

            if not unprocessed_job.is_poison:
                for pack in selector.select(unprocessed_job.pack):
                    # First, perform the component action on the pack
                    try:
                        if isinstance(processor, Caster):
                            # Replacing the job pack with the casted version.
                            unprocessed_job.alter_pack(processor.cast(pack))
                        elif isinstance(processor, BaseProcessor):
                            processor.process(pack)
                        elif isinstance(processor, Evaluator):
                            processor.consume_next(
                                pack,
                                self._predict_to_gold[unprocessed_job.id])

                        # After the component action, make sure the entry is
                        # added into the index.
                        pack.add_all_remaining_entries()
                    except Exception as e:
                        raise ProcessExecutionException(
                            f'Exception occurred when running '
                            f'{processor.name}') from e

                    # Then, based on component type, handle the queue.
                    if isinstance(processor, BaseBatchProcessor):
                        index = unprocessed_queue_indices[current_queue_index]

                        # check status of all the jobs up to "index"
                        for i, job_i in enumerate(
                                itertools.islice(current_queue, 0, index + 1)):

                            if job_i.status == ProcessJobStatus.PROCESSED:
                                processed_queue_indices[
                                    current_queue_index] = i

                        # there are UNPROCESSED jobs in the queue
                        if index < len(current_queue) - 1:
                            unprocessed_queue_indices[current_queue_index] \
                                += 1

                        # Fetch more data from the reader to process the
                        # first job
                        elif (processed_queue_indices[current_queue_index] ==
                              -1):

                            unprocessed_queue_indices[current_queue_index] \
                                = len(current_queue)

                            self.proc_mgr.current_processor_index = 0

                            self.proc_mgr.current_queue_index = -1

                        else:
                            processed_queue_index = \
                                processed_queue_indices[current_queue_index]

                            # move or yield the pack
                            c_queue = list(current_queue)
                            for job_i in \
                                    c_queue[:processed_queue_index + 1]:

                                if should_yield:
                                    if job_i.id in self._predict_to_gold:
                                        self._predict_to_gold.pop(job_i.id)
                                    yield job_i.pack
                                else:
                                    self.proc_mgr.add_to_queue(
                                        queue_index=next_queue_index,
                                        job=job_i)
                                current_queue.popleft()

                            # set the UNPROCESSED and PROCESSED indices
                            unprocessed_queue_indices[current_queue_index] \
                                = len(current_queue)

                            processed_queue_indices[current_queue_index] \
                                = -1

                            if should_yield:
                                self.proc_mgr.current_processor_index = 0
                                self.proc_mgr.current_queue_index = -1
                            else:
                                self.proc_mgr.current_processor_index \
                                    = next_queue_index
                                self.proc_mgr.current_queue_index \
                                    = next_queue_index

                    # Besides batch processors, the other component types deal
                    # with only one pack at a time; these include
                    # PackProcessor, Evaluator, and Caster. Their jobs are
                    # moved to the next queue directly.
                    else:
                        index = unprocessed_queue_indices[current_queue_index]

                        # there are UNPROCESSED jobs in the queue
                        if index < len(current_queue) - 1:
                            unprocessed_queue_indices[current_queue_index] \
                                += 1
                        else:
                            # current_queue is modified inside this loop, so
                            # iterate over a copy.
                            for job_i in list(current_queue):
                                if should_yield:
                                    if job_i.id in self._predict_to_gold:
                                        self._predict_to_gold.pop(job_i.id)
                                    yield job_i.pack
                                else:
                                    self.proc_mgr.add_to_queue(
                                        queue_index=next_queue_index,
                                        job=job_i)
                                current_queue.popleft()

                            # set the UNPROCESSED index
                            # we do not use "processed_queue_indices" as the
                            # jobs get PROCESSED whenever they are passed
                            # into a PackProcessor
                            unprocessed_queue_indices[current_queue_index] \
                                = len(current_queue)

                            # update the current queue and processor only
                            # when all the jobs are processed in the current
                            # queue
                            if should_yield:
                                self.proc_mgr.current_processor_index = 0
                                self.proc_mgr.current_queue_index = -1

                            else:
                                self.proc_mgr.current_processor_index \
                                    = next_queue_index
                                self.proc_mgr.current_queue_index \
                                    = next_queue_index
            else:
                processor.flush()

                # The current queue is modified in the loop, so iterate over
                # a copy.
                for job in list(current_queue):
                    if job.status != ProcessJobStatus.PROCESSED and \
                            not job.is_poison:
                        raise ValueError("Job is neither PROCESSED nor is "
                                         "a poison. Something went wrong "
                                         "during execution.")

                    if not job.is_poison and should_yield:
                        if job.id in self._predict_to_gold:
                            self._predict_to_gold.pop(job.id)
                        yield job.pack

                    elif not should_yield:
                        self.proc_mgr.add_to_queue(
                            queue_index=next_queue_index, job=job)

                    if not job.is_poison:
                        current_queue.popleft()

                if not should_yield:
                    # set next processor and queue as current
                    self.proc_mgr.current_processor_index = next_queue_index
                    self.proc_mgr.current_queue_index = next_queue_index
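
The batching behavior described in the comment block above can be reduced to a toy simulation: a batch stage holds jobs (QUEUED) until the batch is full, then releases them downstream, and the end of input flushes the remainder. All names here are illustrative, not the real ProcessManager API:

    from collections import deque

    def batch_stage(jobs, batch_size=2):
        queue = deque()
        for job in jobs:
            queue.append(job)  # the job enters QUEUED status
            if len(queue) < batch_size:
                continue  # batch not full: request more data from the reader
            while queue:  # batch full: jobs become PROCESSED and move on
                yield queue.popleft()
        while queue:  # end of input acts like a poison pack: flush the rest
            yield queue.popleft()

    print(list(batch_stage(["J1", "J2", "J3"])))  # ['J1', 'J2', 'J3']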
Example #7
    def _process_packs(
            self, data_iter: Iterator[PackType]) -> Iterator[PackType]:
        r"""Process the packs received from the reader by the running through
        the pipeline.

        Args:
            data_iter (iterator): An iterator of the packs read from the
                reader.

        Returns:
            Yields packs that are processed by the pipeline.
        """

        # pylint: disable=line-too-long

        # Here is the logic for the execution of the pipeline.

        # The basic idea is to yield a pack as soon as it gets processed by all
        # the processors instead of waiting for later jobs to get processed.

        # 1) A job can be in one of three statuses:
        #  - UNPROCESSED
        #  - QUEUED
        #  - PROCESSED
        #
        # 2) Each processor maintains a queue to hold the jobs to be executed
        # next.
        #
        # 3) In case of a BatchProcessor, a job enters the QUEUED status if
        # the batch is not full according to the batcher of that processor.
        # In that case, the pipeline requests additional jobs from the reader
        # and restarts the execution loop from the beginning.
        #
        # 4) At any point, while moving to the next processor, the pipeline
        # ensures that all jobs are either in QUEUED or PROCESSED status. If
        # they are PROCESSED, they will be moved to the next queue. This design
        # ensures that at any point, while processing the job at processor `i`,
        # all the jobs in the previous queues are in QUEUED status. So whenever
        # a new job is needed, the pipeline can directly request it from the
        # reader instead of looking at previous queues for UNPROCESSED jobs.
        #
        # 5) When a processor receives a poison pack, it flushes all the
        # remaining batches in its memory (this actually has no effect on
        # PackProcessors) and moves the jobs, including the poison pack, to
        # the next queue. If there is no next processor, the packs are
        # yielded.
        #
        # 6) The loop terminates when the last queue contains only a poison
        # pack.
        #
        # Here is the sample pipeline and its execution
        #
        # Assume 1 pack corresponds to a batch of size 1
        #
        # After 1st step (iteration), reading from the reader,
        #
        #            batch_size = 2                               batch_size = 2
        #  Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|
        #          |___________|
        #          |___________|
        #          |___________|
        #          |_J1:QUEUED_|
        #
        # B1 needs another pack to process job J1
        #
        # After 2nd step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|       |_______________|
        #          |___________|       |_______________|
        #          |___________|       |_______________|
        #          |___________|       |_J2:UNPROCESSED_|
        #          |___________|       |_J1:UNPROCESSED_|
        #
        # B1 processes both the packs, the jobs are moved to the next queue.
        #
        # After 3rd step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_J2:UNPROCESSED_|     |_J1:UNPROCESSED_|
        #
        # P1 processes the first job. However, there exists one UNPROCESSED
        # job, J2, in the queue. The pipeline first processes this job before
        # moving to the next processor.
        #
        # After 4th step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_J2:UNPROCESSED_|
        #        |___________|       |_______________|     |_J1:UNPROCESSED_|
        #
        #
        # After 5th step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|    --> Yield J1.pack and J2.pack
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|

        if not self.initialized:
            raise ProcessFlowException(
                "Please call initialize before running the pipeline")

        buffer = ProcessBuffer(self, data_iter)

        if len(self.components) == 0:
            yield from data_iter
            # Write return here instead of using if..else to reduce indent.
            return

        while not self._proc_mgr.exhausted():
            # Take the raw job from the buffer; its status should now be
            # UNPROCESSED.
            raw_job: ProcessJob = next(buffer)

            component_index = self._proc_mgr.current_processor_index
            component = self.components[component_index]
            selector: Selector = self._selectors[component_index]
            current_queue_index = self._proc_mgr.current_queue_index
            current_queue: Deque[ProcessJob] = self._proc_mgr.current_queue
            pipeline_length = self._proc_mgr.pipeline_length
            unprocessed_queue_indices = self._proc_mgr.unprocessed_queue_indices
            processed_queue_indices = self._proc_mgr.processed_queue_indices
            next_queue_index = current_queue_index + 1
            should_yield = next_queue_index >= pipeline_length

            if not raw_job.is_poison:
                self._process_with_component(selector, component, raw_job)

                # Then, based on component type, handle the queue.
                if isinstance(component, BaseBatchProcessor):
                    self.__update_batch_job_status(component)
                    index = unprocessed_queue_indices[current_queue_index]

                    # Check status of all the jobs up to "index".
                    for i, job_i in enumerate(
                            itertools.islice(current_queue, 0, index + 1)):
                        if job_i.status == ProcessJobStatus.PROCESSED:
                            processed_queue_indices[current_queue_index] = i

                    # There are UNPROCESSED jobs in the queue.
                    if index < len(current_queue) - 1:
                        unprocessed_queue_indices[current_queue_index] += 1
                    elif processed_queue_indices[current_queue_index] == -1:
                        # Fetch more data from the reader to process the
                        # first job.
                        unprocessed_queue_indices[
                            current_queue_index] = len(current_queue)
                        self._proc_mgr.current_processor_index = 0
                        self._proc_mgr.current_queue_index = -1
                    else:
                        processed_queue_index = processed_queue_indices[
                            current_queue_index]
                        # Move or yield the pack.
                        c_queue = list(current_queue)
                        for job_i in c_queue[:processed_queue_index + 1]:
                            if job_i.status == ProcessJobStatus.PROCESSED:
                                if should_yield:
                                    if job_i.id in self._predict_to_gold:
                                        self._predict_to_gold.pop(job_i.id)
                                    # TODO: I don't know why these are
                                    #  marked as incompatible type by mypy.
                                    #  the same happens 3 times on every yield.
                                    #  It is observed that the pack returned
                                    #  from the `ProcessJob` is considered to
                                    #  be different from `PackType`.
                                    yield job_i.pack  # type: ignore
                                else:
                                    self._proc_mgr.add_to_queue(
                                        queue_index=next_queue_index, job=job_i)
                            else:
                                raise ProcessFlowException(
                                    f"The job status should be "
                                    f"{ProcessJobStatus.PROCESSED} "
                                    f"at this point.")
                            current_queue.popleft()

                        # Set the UNPROCESSED and PROCESSED indices.
                        unprocessed_queue_indices[
                            current_queue_index] = len(current_queue)

                        processed_queue_indices[current_queue_index] = -1

                        if should_yield:
                            self._proc_mgr.current_processor_index = 0
                            self._proc_mgr.current_queue_index = -1
                        else:
                            self._proc_mgr.current_processor_index \
                                = next_queue_index
                            self._proc_mgr.current_queue_index \
                                = next_queue_index
                # Besides batch processors, the other component types deal
                # with only one pack at a time; these include PackProcessor,
                # Evaluator, and Caster. Their jobs are moved to the next
                # queue directly.
                else:
                    self.__update_stream_job_status()
                    index = unprocessed_queue_indices[current_queue_index]

                    # there are UNPROCESSED jobs in the queue
                    if index < len(current_queue) - 1:
                        unprocessed_queue_indices[current_queue_index] += 1
                    else:
                        # current_queue is modified inside this loop, so
                        # iterate over a copy.
                        for job_i in list(current_queue):
                            if job_i.status == ProcessJobStatus.PROCESSED:
                                if should_yield:
                                    if job_i.id in self._predict_to_gold:
                                        self._predict_to_gold.pop(job_i.id)
                                    yield job_i.pack  # type: ignore
                                else:
                                    self._proc_mgr.add_to_queue(
                                        queue_index=next_queue_index, job=job_i)
                                current_queue.popleft()
                            else:
                                raise ProcessFlowException(
                                    f"The job status should be "
                                    f"{ProcessJobStatus.PROCESSED} "
                                    f"at this point.")

                        # set the UNPROCESSED index
                        # we do not use "processed_queue_indices" as the
                        # jobs get PROCESSED whenever they are passed
                        # into a PackProcessor
                        unprocessed_queue_indices[current_queue_index] \
                            = len(current_queue)

                        # update the current queue and processor only
                        # when all the jobs are processed in the current
                        # queue
                        if should_yield:
                            self._proc_mgr.current_processor_index = 0
                            self._proc_mgr.current_queue_index = -1

                        else:
                            self._proc_mgr.current_processor_index \
                                = next_queue_index
                            self._proc_mgr.current_queue_index \
                                = next_queue_index
            else:
                component.flush()
                self.__flush_batch_job_status()

                # The current queue is modified in the loop, so iterate over
                # a copy.
                for job in list(current_queue):
                    if job.status != ProcessJobStatus.PROCESSED and \
                            not job.is_poison:
                        raise ValueError("Job is neither PROCESSED nor is "
                                         "a poison. Something went wrong "
                                         "during execution.")

                    if not job.is_poison and should_yield:
                        if job.id in self._predict_to_gold:
                            self._predict_to_gold.pop(job.id)
                        yield job.pack  # type: ignore

                    elif not should_yield:
                        self._proc_mgr.add_to_queue(
                            queue_index=next_queue_index, job=job)

                    if not job.is_poison:
                        current_queue.popleft()

                if not should_yield:
                    # set next processor and queue as current
                    self._proc_mgr.current_processor_index = next_queue_index
                    self._proc_mgr.current_queue_index = next_queue_index

        self._proc_mgr.reset()
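
The poison-pack branch above follows a common sentinel pattern: a marker placed after the real data tells each stage to flush its buffer and propagate termination. A self-contained sketch with illustrative names:

    POISON = object()  # sentinel marking the end of the stream

    def with_poison(items):
        yield from items
        yield POISON

    def stage(jobs):
        pending = []
        for job in jobs:
            if job is POISON:
                yield from pending  # flush whatever is still buffered
                pending.clear()
                yield POISON  # propagate the poison downstream
            else:
                pending.append(job)

    final = [j for j in stage(with_poison([1, 2, 3])) if j is not POISON]
    print(final)  # [1, 2, 3]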
Example #8
    def add(
        self,
        component: PipelineComponent,
        config: Optional[Union[Config, Dict[str, Any]]] = None,
        selector: Optional[Selector] = None,
    ) -> "Pipeline":
        """
        Adds a pipeline component to the pipeline. The pipeline components
        will form a chain based on the insertion order. The customized
        `config` and `selector` (:class:`~forte.data.selector.Selector`)
        will be associated with this particular component. If the `config`
        or the `selector` is not provided, the default ones will be used.

        Here, note that the same component instance can be added multiple
        times to the pipeline. In such cases, the instance will only be
        set up at the first insertion (i.e. its `initialize` function will
        only be called once). Subsequent insertions of the same component
        instance will not change the behavior or the state of the instance.
        Thus, a different `config` cannot be provided (it should be `None`)
        when the component is added again; otherwise a `ProcessorConfigError`
        will be thrown. If one wants the components to behave differently,
        a different instance should be used.

        Args:
            component (PipelineComponent): The component to be inserted next
              to the pipeline.
            config (Union[Config, Dict[str, Any]]): The custom configuration
              to be used for the added component. Default None, which means
              the `default_configs()` of the component will be used.
            selector (Selector): The selector used to pick the corresponding
              data pack to be consumed by the component. Default None, which
              means the whole pack will be used.

        Returns:
            The pipeline itself, which enables one to chain the creation of
            the pipeline, i.e., you can do:

            .. code-block:: python

                Pipeline().set_reader(your_reader()).add(
                    your_processor()).add(another_processor())
        """
        if isinstance(component, BaseReader):
            raise ProcessFlowException(
                "Reader need to be set via set_reader()")

        if isinstance(component, Evaluator):
            # This will ask the job to keep a copy of the gold standard.
            self.evaluator_indices.append(len(self.components))

        if component not in self.__component_set:
            # The component has not been added to the pipeline before.
            self._components.append(component)
            self.__component_set.add(component)
            self.component_configs.append(component.make_configs(config))
        else:
            if config is None:
                self._components.append(component)
                # We insert a `None` value here just to make the config list
                # to match the component list, but this config should not be
                # used.
                self.component_configs.append(None)
            else:
                raise ProcessorConfigError(
                    f"The same instance of a component named {component.name}"
                    f" has already been added to the pipeline. We do not"
                    f" accept a different configuration for it. If you would"
                    f" like to use a differently configured component, please"
                    f" create another instance. If you intend to re-use the"
                    f" component instance, please do not provide the `config`"
                    f" (or provide `None`).")

        if selector is None:
            self._selectors.append(self.__default_selector)
        else:
            self._selectors.append(selector)

        return self
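
The re-insertion rule documented above, in usage form. This mirrors the docstring's own placeholders (`your_reader`, `your_processor`), so it is a sketch rather than runnable code:

    proc = your_processor()
    p = Pipeline().set_reader(your_reader())
    p.add(proc, config={"some_key": "some_value"})  # first insertion: config allowed
    p.add(proc)  # re-insertion of the same instance: config must stay None
    # p.add(proc, config={"some_key": "other"})  # would raise ProcessorConfigError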
Example #9
    def get_input_source(self) -> str:
        if self.instance().initial_reader is None:
            raise ProcessFlowException("Input source is not set.")

        return self.instance().initial_reader
Example #10
    def instance(self):  # The return type, __PackManager, is private and not easily annotated.
        if self.__instance is None:
            raise ProcessFlowException("The pack manager is not initialized.")

        return self.__instance
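
The guarded access in the last two snippets follows a standard singleton pattern: hold a private class-level instance and raise until it is initialized. A minimal, generic sketch (names are not Forte's):

    class Manager:
        __instance = None

        @classmethod
        def init_instance(cls):
            cls.__instance = cls()

        @classmethod
        def instance(cls):
            if cls.__instance is None:
                raise RuntimeError("The manager is not initialized.")
            return cls.__instance

    Manager.init_instance()
    assert Manager.instance() is Manager.instance()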