Example #1
class DefaultPipeline:
    """The biomedicus default pipeline for processing clinical documents.

    Attributes
        events_client (mtap.EventsClient): An MTAP events client used by the pipeline.
        pipeline (mtap.Pipeline): An MTAP pipeline to use to process documents.

    """
    def __init__(self, conf: PipelineConf, *, events_client: Optional[EventsClient] = None):
        conf.populate_addresses()
        if events_client is not None:
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        pipeline = [
            (conf.sentences_id, conf.sentences_address),
            (conf.section_headers_id, conf.section_headers_address),
            (conf.tagger_id, conf.tagger_address),
            (conf.acronyms_id, conf.acronyms_address),
            (conf.concepts_id, conf.concepts_address),
            (conf.negation_id, conf.negation_address),
            (conf.selective_dependencies_id, conf.selective_dependencies_address),
            (conf.deepen_id, conf.deepen_address)
        ]
        if conf.use_discovery:
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier) for identifier, _ in pipeline]
            )
        else:
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier, address=addr) for identifier, addr in pipeline]
            )
        if conf.serializer is not None:
            serialization_proc = SerializationProcessor(get_serializer(conf.serializer),
                                                        conf.output_directory,
                                                        include_label_text=conf.include_label_text)
            ser_comp = LocalProcessor(serialization_proc, component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)

    def process_text(self, text: str, *, event_id: Optional[str] = None) -> ProcessingResult:
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            result = self.pipeline.run(document)
        return result

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pipeline.close()
        if self.close_client:
            self.events_client.close()
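
A minimal usage sketch for the class above. PipelineConf is referenced but not
defined in this listing, so its construction and defaults here are assumptions:

conf = PipelineConf()  # hypothetical default construction; real fields not shown here
conf.events_address = 'localhost:50051'  # field read in __init__ above; address illustrative
with DefaultPipeline(conf) as default_pipeline:
    result = default_pipeline.process_text('The patient denies chest pain.')
    default_pipeline.pipeline.print_times()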
Example #2
class DefaultPipeline:
    """The biomedicus default pipeline for processing clinical documents.

    Attributes
        events_client (mtap.EventsClient): An MTAP events client used by the pipeline.
        pipeline (mtap.Pipeline): An MTAP pipeline to use to process documents.

    """
    def __init__(self,
                 conf_path: Union[str, Path],
                 output_directory: Union[str, Path],
                 *,
                 events_address: Optional[str] = None,
                 events_client: Optional[EventsClient] = None,
                 serializer: Optional[str] = None,
                 include_label_text: bool = False):
        if events_address in ('None', 'none', 'null', ''):
            events_address = None
        if events_client is not None:
            self.close_client = False
            self.events_client = events_client
        else:
            self.close_client = True
            self.events_client = EventsClient(address=events_address)

        self.pipeline = Pipeline.from_yaml_file(conf_path)

        if serializer == 'None':
            serializer = None
        if serializer is not None:
            serialization_proc = SerializationProcessor(
                get_serializer(serializer),
                output_directory,
                include_label_text=include_label_text)
            ser_comp = LocalProcessor(serialization_proc,
                                      component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)

    def process_text(self,
                     text: str,
                     *,
                     event_id: Optional[str] = None) -> ProcessingResult:
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            result = self.pipeline.run(document)
        return result

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pipeline.close()
        if self.close_client:
            self.events_client.close()
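
A usage sketch for this variant, assuming a configuration file in the format read
by Pipeline.from_yaml_file (see Example #5); the 'json' serializer name is
illustrative, since get_serializer is not shown in this listing:

with DefaultPipeline('pipeline.yml', 'output/',
                     events_address='localhost:50051',
                     serializer='json') as default_pipeline:
    result = default_pipeline.process_text('The patient denies chest pain.')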
Example #3
class DefaultPipeline:
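    """The BioMedICUS default pipeline for processing clinical documents.

    Attributes:
        events_client (mtap.EventsClient): An MTAP events client used by the pipeline.
        pipeline (mtap.Pipeline): An MTAP pipeline to use to process documents.

    """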
    def __init__(self,
                 conf: DefaultPipelineConf,
                 *,
                 events_client: Optional[EventsClient] = None):
        if events_client is not None:
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        pipeline = [(conf.sentences_id, conf.sentences_address),
                    (conf.tagger_id, conf.tagger_address),
                    (conf.acronyms_id, conf.acronyms_address),
                    (conf.concepts_id, conf.concepts_address),
                    (conf.negation_id, conf.negation_address)]
        if conf.use_discovery:
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier) for identifier, _ in pipeline],
                n_threads=conf.threads)
        else:
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier, address=addr)
                  for identifier, addr in pipeline],
                n_threads=conf.threads)
        if conf.serializer is not None:
            serialization_proc = SerializationProcessor(
                get_serializer(conf.serializer),
                conf.output_directory,
                include_label_text=conf.include_label_text)
            ser_comp = LocalProcessor(serialization_proc,
                                      component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)

    def process_text(self,
                     text: str,
                     *,
                     event_id: Optional[str] = None) -> ProcessingResult:
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            result = self.pipeline.run(document)
        return result

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pipeline.close()
        if self.close_client:
            self.events_client.close()
Example #4
class DefaultPipeline:
    """The biomedicus default pipeline for processing clinical documents.

    Attributes
        events_client (mtap.EventsClient): An MTAP events client used by the pipeline.
        pipeline (mtap.Pipeline): An MTAP pipeline to use to process documents.

    """
    def __init__(self,
                 conf: PipelineConf,
                 *,
                 events_client: Optional[EventsClient] = None):
        conf.populate_addresses()
        if events_client is not None:
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        pipeline = [(conf.sentences_id, conf.sentences_address),
                    (conf.tagger_id, conf.tagger_address)]
        if conf.use_discovery:
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier) for identifier, _ in pipeline])
        else:
            self.pipeline = Pipeline(*[
                RemoteProcessor(identifier, address=addr)
                for identifier, addr in pipeline
            ])

    def process_text(self,
                     text: str,
                     *,
                     event_id: Optional[str] = None) -> ProcessingResult:
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            result = self.pipeline.run(document)
        return result

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pipeline.close()
        if self.close_client:
            self.events_client.close()
Example #5
class Pipeline(MutableSequence['processing.ComponentDescriptor']):
    """An object which can be used to build and run a pipeline of remote and local processors.

    Pipelines are a :obj:`~typing.MutableSequence` containing
    one or more :obj:`~mtap.processing.pipeline.ComponentDescriptor` objects,
    so a pipeline can be modified after creation using the standard sequence operations.

    Args:
        *components (ComponentDescriptor):
            A list of component descriptors created using :class:`RemoteProcessor` or
            :class:`LocalProcessor`.

    Keyword Args:
        name (~typing.Optional[str]): An optional name for the pipeline, defaults to 'pipeline'.
        events_address (~typing.Optional[str]): An optional address of the events service.
        events_client (~typing.Optional[EventsClient]): An optional events client; if one is not
            provided, a client will be created lazily from ``events_address``.
        mp_config (~typing.Optional[MpConfig]): An optional multiprocessing configuration,
            defaults to ``MpConfig()``.

    Examples:
        Remote pipeline with name discovery:

        >>> with mtap.Events() as events, mtap.Pipeline(
        >>>         RemoteProcessor('processor-1-id'),
        >>>         RemoteProcessor('processor-2-id'),
        >>>         RemoteProcessor('processor-3-id')
        >>>     ) as pipeline:
        >>>     for txt in txts:
        >>>         with events.open_event() as event:
        >>>             document = event.add_document('plaintext', txt)
        >>>             results = pipeline.run(document)

        Remote pipeline using addresses:

        >>> with mtap.Events(address='localhost:50051') as events, mtap.Pipeline(
        >>>         RemoteProcessor('processor-1-id', address='localhost:50052'),
        >>>         RemoteProcessor('processor-2-id', address='localhost:50053'),
        >>>         RemoteProcessor('processor-3-id', address='localhost:50054')
        >>>     ) as pipeline:
        >>>     for txt in txts:
        >>>         event = events.open_event()
        >>>         document = event.add_document('plaintext', txt)
        >>>         results = pipeline.run(document)

        Modifying a pipeline:

        >>> pipeline = Pipeline(RemoteProcessor('foo', address='localhost:50000'),
        >>>                     RemoteProcessor('bar', address='localhost:50000'))
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None),
                 RemoteProcessor(processor_id='bar', address='localhost:50000', component_id=None, params=None))
        >>> pipeline.append(RemoteProcessor('baz', address='localhost:50001'))
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None),
                 RemoteProcessor(processor_id='bar', address='localhost:50000', component_id=None, params=None),
                 RemoteProcessor(processor_id='baz', address='localhost:50001', component_id=None, params=None))
        >>> del pipeline[1]
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None),
                 RemoteProcessor(processor_id='baz', address='localhost:50001', component_id=None, params=None))
        >>> pipeline[1] = RemoteProcessor(processor_id='bar', address='localhost:50003')
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None),
                 RemoteProcessor(processor_id='bar', address='localhost:50003', component_id=None, params=None))
        >>> pipeline += list(pipeline)  # Putting in a new list to prevent an infinite recursion
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None),
                 RemoteProcessor(processor_id='bar', address='localhost:50003', component_id=None, params=None),
                 RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None),
                 RemoteProcessor(processor_id='bar', address='localhost:50003', component_id=None, params=None))

    Attributes:
        name (str): The pipeline's name.
    """
    __slots__ = [
        '_component_ids', 'name', '_component_descriptors', 'events_address',
        'mp_config', '_created_events_client', '_events_client', 'times_map',
        '__components'
    ]

    def __init__(self,
                 *components: 'processing.ComponentDescriptor',
                 name: Optional[str] = None,
                 events_address: Optional[str] = None,
                 events_client: Optional[EventsClient] = None,
                 mp_config: Optional[MpConfig] = None):
        self._component_ids = {}
        self.name = name or 'pipeline'
        self._component_descriptors = list(components)
        self.events_address = events_address
        self._created_events_client = False
        self._events_client = None
        if events_client is not None:
            self.events_client = events_client
        self.mp_config = mp_config or MpConfig()
        self.times_map = {}

    def __reduce__(self):
        return _create_pipeline, (self.name, self.events_address,
                                  self._events_client, self.mp_config) + tuple(
                                      self._component_descriptors)

    @staticmethod
    def from_yaml_file(conf_path: Union[pathlib.Path, str]) -> 'Pipeline':
        """Creates a pipeline from a yaml pipeline configuration file.

        Args:
            conf_path (str or pathlib.Path): The path to the configuration file.

        Returns:
            Pipeline object from the configuration.
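
        Examples:
            A configuration file matching the keys read by
            :func:`load_configuration` might look like this (name and addresses
            are illustrative)::

                name: my-pipeline
                events_address: localhost:50051
                components:
                  - processor_id: processor-1-id
                    address: localhost:50052

            >>> pipeline = Pipeline.from_yaml_file('pipeline.yml')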

        """
        conf_path = pathlib.Path(conf_path)
        from yaml import load
        try:
            from yaml import CLoader as Loader
        except ImportError:
            from yaml import Loader
        with conf_path.open('rb') as f:
            conf = load(f, Loader=Loader)
        return Pipeline.load_configuration(conf)

    @staticmethod
    def load_configuration(conf: Dict) -> 'Pipeline':
        """Creates a pipeline from a pipeline configuration dictionary.

        Args:
            conf (Dict): The pipeline configuration dictionary.

        Returns:
            Pipeline created from the configuration.
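
        Examples:
            The dictionary mirrors the yaml file layout (values illustrative):

            >>> pipeline = Pipeline.load_configuration({
            >>>     'name': 'my-pipeline',
            >>>     'events_address': 'localhost:50051',
            >>>     'components': [
            >>>         {'processor_id': 'processor-1-id',
            >>>          'address': 'localhost:50052'},
            >>>     ],
            >>> })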

        """
        name = conf.get('name', None)
        events_address = conf.get('events_address', None) or conf.get(
            'events_addresses', None)
        components = []
        conf_components = conf.get('components', [])
        for conf_component in conf_components:
            components.append(
                RemoteProcessor(processor_id=conf_component['processor_id'],
                                address=conf_component['address'],
                                component_id=conf_component.get(
                                    'component_id', None),
                                params=dict(conf_component.get('params', {}))))
        mp_config = MpConfig.from_configuration(conf.get('mp_config', {}))
        return Pipeline(*components,
                        name=name,
                        events_address=events_address,
                        mp_config=mp_config)

    @property
    def events_client(self) -> EventsClient:
        if self._events_client is not None:
            return self._events_client
        self._created_events_client = True
        self._events_client = EventsClient(address=self.events_address)
        return self._events_client

    @events_client.setter
    def events_client(self, value: EventsClient):
        self._events_client = value

    @property
    def _components(self) -> 'List[processing.ProcessingComponent]':
        try:
            return self.__components
        except AttributeError:
            self.__components = [
                desc.create_pipeline_component(self._component_ids,
                                               lambda: self.events_client)
                for desc in self._component_descriptors
            ]
            return self.__components

    @_components.deleter
    def _components(self):
        for component in self.__components:
            component.close()
        del self.__components

    def run_multithread(self,
                        source: Union[Iterable[Union['mtap.Document',
                                                     'mtap.Event']],
                                      'processing.ProcessingSource'],
                        *,
                        params: Optional[Dict[str, Any]] = None,
                        show_progress: Optional[bool] = None,
                        total: Optional[int] = None,
                        close_events: Optional[bool] = None,
                        max_failures: Optional[int] = None,
                        workers: Optional[int] = None,
                        read_ahead: Optional[int] = None,
                        mp_context=None):
        """Runs this pipeline on a source which provides multiple documents / events.

        Concurrency is per-event, with each event being provided a thread which runs it through the
        pipeline.

        Args:
            source (~typing.Union[~typing.Iterable[~typing.Union[Event, Document]], ProcessingSource]):
                A generator of events or documents to process. This should be an
                :obj:`~typing.Iterable` of either :obj:`Event` or :obj:`Document` objects or a
                :obj:`~mtap.processing.ProcessingSource`.
            params (~typing.Optional[dict[str, ~typing.Any]]):
                A json-serializable dict of params specific to processing these events. The
                existing params dictionary defined in :func:`~PipelineBuilder.add_processor`
                will be updated with the contents of this dict.
            show_progress (~typing.Optional[bool]):
                Whether to print a progress bar using tqdm.
            total (~typing.Optional[int]):
                An optional argument indicating the total number of events / documents that will
                be provided by the iterable, for the progress bar.
            close_events (~typing.Optional[bool]):
                Whether the pipeline should close events after they have been fully processed
                through all components.
            max_failures (~typing.Optional[int]):
                The number of acceptable failures. Once this amount is exceeded, processing will
                halt. Note that because of the concurrent nature of processing, work may continue
                for a short amount of time after termination is triggered.
            workers (~typing.Optional[int]):
                The number of threads to process documents on.
            read_ahead (~typing.Optional[int]):
                The number of source documents to read ahead into memory before processing.
            mp_context (multiprocessing context, optional):
                An optional override for the multiprocessing context.

        Examples:
            >>> docs = list(Path('abc/').glob('*.txt'))
            >>> def document_source():
            >>>     for path in docs:
            >>>         with path.open('r') as f:
            >>>             txt = f.read()
            >>>         with Event(event_id=path.name, client=client) as event:
            >>>             doc = event.create_document('plaintext', txt)
            >>>             yield doc
            >>>
            >>> pipeline.run_multithread(document_source(), total=len(docs))
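
            The :obj:`MpConfig` defaults can also be overridden for a single call
            via the keyword arguments:

            >>> pipeline.run_multithread(document_source(), total=len(docs),
            >>>                          workers=8, show_progress=True)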

        """
        show_progress = show_progress if show_progress is not None else self.mp_config.show_progress
        close_events = close_events if close_events is not None else self.mp_config.close_events
        max_failures = max_failures if max_failures is not None else self.mp_config.max_failures
        workers = workers if workers is not None else self.mp_config.workers
        if mp_context is None:
            mp_context = multiprocessing.get_context(self.mp_config.mp_start_method)
        read_ahead = read_ahead if read_ahead is not None else self.mp_config.read_ahead
        with _PipelineMultiRunner(self, source, params, show_progress, total,
                                  close_events, max_failures, workers,
                                  read_ahead, mp_context) as runner:
            runner.run()

    def run(
        self,
        target: Union['mtap.Event', 'mtap.Document'],
        *,
        params: Optional[Dict[str,
                              Any]] = None) -> 'processing.PipelineResult':
        """Processes the event/document using all of the processors in the pipeline.

        Args:
            target (~typing.Union[Event, Document]): Either an event or a document to process.
            params (~typing.Optional[dict[str, ~typing.Any]]):
                A json-serializable dict of params specific to processing this event. The
                existing params dictionary defined in :func:`~PipelineBuilder.add_processor`
                will be updated with the contents of this dict.

        Returns:
            PipelineResult: The results of all the processors in the pipeline.

        Examples:
            >>> e = mtap.Event()
            >>> document = mtap.Document('plaintext', text="...", event=e)
            >>> with Pipeline(...) as pipeline:
            >>>     pipeline.run(document)
            >>>     # is equivalent to pipeline.run(document.event, params={'document_name': document.document_name})

            The 'document_name' param is used to indicate to :obj:`~mtap.DocumentProcessor`
            which document on the event to process.
        """
        event, params = _event_and_params(target, params)
        event_id = event.event_id

        result = self._run_by_event_id(event_id,
                                       event.event_service_instance_id, params)
        self._add_result_times(result)

        for component_result in result.component_results:
            try:
                event.add_created_indices(component_result.created_indices)
            except AttributeError:
                pass
        return result

    def _run_by_event_id(self, event_id, event_service_instance_id, params):
        start = datetime.now()
        results = [
            component.call_process(event_id, event_service_instance_id, params)
            for component in self._components
        ]
        total = datetime.now() - start
        results = [
            _base.ProcessingResult(identifier=component.component_id,
                                   result_dict=result[0],
                                   timing_info=result[1],
                                   created_indices=result[2])
            for component, result in zip(self._components, results)
        ]
        logger.debug('Finished processing event_id: %s', event_id)
        return _base.PipelineResult(results, total)

    def _add_result_times(self, result):
        times = {}
        for component_id, _, component_times, _ in result.component_results:
            times.update({
                component_id + ':' + k: v
                for k, v in component_times.items()
            })
        times[self.name + 'total'] = result.elapsed_time
        _timing.add_times(self.times_map, times)

    @overload
    def processor_timer_stats(self) -> 'List[processing.AggregateTimingInfo]':
        """Returns the timing information for all processors.

        Returns:
            List[AggregateTimingInfo]:
                A list of timing info objects, one for each processor, in the same order
                that the processors were added to the pipeline.
        """
        ...

    @overload
    def processor_timer_stats(
            self, identifier: str) -> 'processing.AggregateTimingInfo':
        """Returns the timing info for one processor.

        Args:
            identifier (str): The pipeline component_id of the processor to return
                timing info for.

        Returns:
            AggregateTimingInfo: The timing info for the specified processor.

        """
        ...

    def processor_timer_stats(self, identifier=None):
        if identifier is not None:
            aggregates = _timing.create_timer_stats(self.times_map,
                                                    identifier + ':')
            aggregates = {
                k[(len(identifier) + 1):]: v
                for k, v in aggregates.items()
            }
            return _base.AggregateTimingInfo(identifier=identifier,
                                             timing_info=aggregates)
        timing_infos = []
        for component in self._components:
            component_id = component.component_id
            aggregates = _timing.create_timer_stats(self.times_map,
                                                    component_id + ':')
            aggregates = {
                k[(len(component_id) + 1):]: v
                for k, v in aggregates.items()
            }
            timing_infos.append(
                _base.AggregateTimingInfo(identifier=component_id,
                                          timing_info=aggregates))

        return timing_infos

    def pipeline_timer_stats(self) -> 'processing.AggregateTimingInfo':
        """The aggregated statistics for the global runtime of the pipeline.

        Returns:
            AggregateTimingInfo: The timing stats for the global runtime of the pipeline.

        """
        pipeline_id = self.name
        aggregates = _timing.create_timer_stats(self.times_map, pipeline_id)
        aggregates = {k[len(pipeline_id):]: v for k, v in aggregates.items()}
        return _base.AggregateTimingInfo(identifier=self.name,
                                         timing_info=aggregates)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Closes any open connections to remote processors.
        """
        for component in self._components:
            try:
                component.close()
            except AttributeError:
                pass
        if self._created_events_client:
            self._events_client.close()

    def as_processor(self) -> 'processing.EventProcessor':
        """Returns the pipeline as a processor.

        Returns:
            EventProcessor: An event processor that can be added to other pipelines or hosted.
        """
        return _PipelineProcessor(self._components)

    def print_times(self):
        """Prints all of the times collected during this pipeline using :func:`print`.
        """
        self.pipeline_timer_stats().print_times()
        for pipeline_timer in self.processor_timer_stats():
            pipeline_timer.print_times()

    def __getitem__(self, item):
        return self._component_descriptors[item]

    def __setitem__(self, key, value):
        self._clear_components()
        self._component_descriptors[key] = value

    def __delitem__(self, key):
        self._clear_components()
        del self._component_descriptors[key]

    def __len__(self):
        return len(self._component_descriptors)

    def _clear_components(self):
        try:
            del self._components
        except AttributeError:
            pass

    def insert(self, index, o) -> None:
        self._clear_components()
        self._component_descriptors.insert(index, o)

    def __repr__(self):
        return "Pipeline(" + ', '.join(
            [repr(component)
             for component in self._component_descriptors]) + ')'
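
A short end-to-end sketch using the Pipeline API from Example #5. The import line,
addresses, and processor identifier are assumptions for illustration:

from mtap import Event, Pipeline, RemoteProcessor

pipeline = Pipeline(
    RemoteProcessor('processor-1-id', address='localhost:50052'),
    name='example-pipeline',
    events_address='localhost:50051')
with pipeline:
    # The events client is created lazily from events_address on first access.
    with Event(event_id='doc-1', client=pipeline.events_client) as event:
        document = event.create_document('plaintext',
                                         text='The patient denies chest pain.')
        result = pipeline.run(document)
    pipeline.print_times()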