Ejemplo n.º 1
0
    def __init__(self,
                 conf_path: Union[str, Path],
                 output_directory: Union[str, Path],
                 *,
                 events_address: Optional[str] = None,
                 events_client: Optional[EventsClient] = None,
                 serializer: Optional[str] = None,
                 include_label_text: bool = False):
        """Builds a pipeline from a YAML configuration file, optionally
        appending a serialization step.

        Args:
            conf_path: Path to the pipeline YAML configuration file.
            output_directory: Directory the serializer writes documents to.
            events_address: Address of the events service. The placeholder
                strings 'None', 'none', 'null', and '' are treated as unset.
            events_client: An existing events client to use. When provided it
                is not closed by this object (``close_client`` is False).
            serializer: Name of a serializer to look up via
                ``get_serializer``, or None / the string 'None' to skip
                serialization entirely.
            include_label_text: Whether the serializer includes covered label
                text in its output.
        """
        # CLI/config front-ends may pass placeholder strings through;
        # normalize them all to a real None before building the client.
        if events_address in ('None', 'none', 'null', ''):
            events_address = None
        if events_client is not None:
            # Caller owns the client's lifetime; do not close it here.
            self.close_client = False
            self.events_client = events_client
        else:
            # We created the client, so we are responsible for closing it.
            self.close_client = True
            self.events_client = EventsClient(address=events_address)

        self.pipeline = Pipeline.from_yaml_file(conf_path)

        # 'None' may arrive as a literal string from config files / CLI args.
        if serializer == 'None':
            serializer = None
        if serializer is not None:
            serialization_proc = SerializationProcessor(
                get_serializer(serializer),
                output_directory,
                include_label_text=include_label_text)
            ser_comp = LocalProcessor(serialization_proc,
                                      component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)
Ejemplo n.º 2
0
def main(args=None):
    """Runs the sentences/tagger/acronyms/concepts processors over every
    *.txt file under the input directory, serializing results as JSON."""
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    for flag in ("--events", "--tagger", "--sentences", "--acronyms",
                 "--norms", "--concepts"):
        parser.add_argument(flag)
    ns = parser.parse_args(args)

    with EventsClient(address=ns.events) as client, Pipeline(
            RemoteProcessor('biomedicus-sentences', address=ns.sentences),
            RemoteProcessor('biomedicus-tnt-tagger', address=ns.tagger),
            RemoteProcessor('biomedicus-acronyms', address=ns.acronyms),
            RemoteProcessor('biomedicus-concepts', address=ns.concepts),
            LocalProcessor(SerializationProcessor(
                JsonSerializer, output_dir=ns.output_directory),
                           component_id='serialize',
                           client=client)) as pipeline:
        for txt_path in Path(ns.input_directory).glob("**/*.txt"):
            print("READING FILE:", str(txt_path))
            contents = txt_path.read_text()
            with Event(event_id=txt_path.stem, client=client) as event:
                document = event.create_document("plaintext", text=contents)
                pipeline.run(document)

        pipeline.print_times()
Ejemplo n.º 3
0
def main(args=None):
    """Labels gold concepts from a CSV onto mipacq source documents, then
    runs the sentences/tagger/acronyms processors and pickles the results.

    Args:
        args: Optional argument list for the parser; defaults to sys.argv.
    """
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("concepts_csv", metavar="PATH_TO_CONCEPTS_CSV")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    parser.add_argument("--sentences")
    parser.add_argument("--tagger")
    parser.add_argument("--acronyms")
    parser.add_argument("--events")

    ns = parser.parse_args(args)

    print('Reading concepts csv...')
    # document identifier -> list of (start, end, cui) gold concept spans.
    concepts = {}
    with open(ns.concepts_csv, 'r') as f:
        # Iterate the file lazily instead of materializing readlines().
        for line in f:
            splits = line.split(',')
            # NOTE(review): column 0 is used as the end offset and column 1
            # as the start offset — confirm this against the CSV schema.
            end = splits[0]
            start = splits[1]
            cui = splits[5]
            identifier = splits[6]
            concepts.setdefault(identifier, []).append((start, end, cui))

    print('Reading mipacq source files...')
    with EventsClient(address=ns.events) as client, \
            Pipeline(
                RemoteProcessor('biomedicus-sentences', address=ns.sentences),
                RemoteProcessor('biomedicus-tnt-tagger', address=ns.tagger),
                RemoteProcessor('biomedicus-acronyms', address=ns.acronyms),
                LocalProcessor(SerializationProcessor(PickleSerializer,
                                                      output_dir=ns.output_directory),
                               component_id='serialize',
                               client=client)
            ) as pipeline:
        for path in Path(ns.input_directory).glob('**/*.source'):
            # File stems look like '<identifier>-...'; documents without any
            # gold concepts in the CSV are skipped entirely.
            identifier = path.stem.split('-')[0]
            try:
                doc_concepts = concepts[identifier]
            except KeyError:
                continue
            with Event(event_id=identifier, client=client) as event:
                with path.open('r') as f:
                    text = f.read()
                document = event.create_document('plaintext', text)
                # Attach the gold annotations before running the pipeline.
                with document.get_labeler('gold_concepts') as label_concept:
                    for start, end, cui in doc_concepts:
                        label_concept(start, end, cui=cui)
                pipeline.run(document)
Ejemplo n.º 4
0
def run_themes_pipeline(input_directory, annotations_directory,
                        output_directory):
    """Runs sentences plus the palliative-themes processors over a directory
    of files, serializing results as JSON."""
    events_address = 'localhost:50100'
    components = [
        RemoteProcessor('biomedicus-sentences', address='localhost:50300'),
        LocalProcessor(AttachPalliativeThemesProcessor(annotations_directory)),
        LocalProcessor(CoalescePalliativeThemesProcessor()),
        LocalProcessor(SerializationProcessor(JsonSerializer,
                                              output_directory)),
    ]
    with Pipeline(*components, events_address=events_address) as pipeline:
        file_source = FilesInDirectoryProcessingSource(pipeline.events_client,
                                                       input_directory)
        pipeline.run_multithread(file_source, workers=8)
Ejemplo n.º 5
0
def run_pipeline(conf):
    """Builds a pipeline from conf.pipeline_config (or the bundled example
    configuration), appends a JSON serializer, and runs it over the files in
    conf.input_directory."""
    default_conf = Path(__file__).parent / 'examplePipelineConfiguration.yml'
    pipeline = Pipeline.from_yaml_file(conf.pipeline_config or default_conf)

    with mtap.EventsClient(address=conf.events_address) as client:
        serialization = SerializationProcessor(
            ser=JsonSerializer, output_dir=conf.output_directory)
        pipeline.append(
            LocalProcessor(proc=serialization,
                           client=client,
                           component_id='serialization_processor'))
        source = FilesInDirectoryProcessingSource(
            directory=conf.input_directory, client=client)
        pipeline.run_multithread(source=source,
                                 workers=conf.threads,
                                 max_failures=conf.max_failures,
                                 read_ahead=conf.read_ahead)
Ejemplo n.º 6
0
    def __init__(self,
                 conf_path: Union[str, Path],
                 output_directory: Union[str, Path],
                 *,
                 events_addresses: Optional[str] = None,
                 serializer: Optional[str] = None,
                 include_label_text: bool = False):
        """Builds a pipeline from a YAML configuration file, optionally
        appending a serialization step.

        Args:
            conf_path: Path to the pipeline YAML configuration file.
            output_directory: Directory the serializer writes documents to.
            events_addresses: Optional events service address(es) to set on
                the pipeline.
            serializer: Serializer name for ``get_serializer``, or None / the
                string 'None' to skip serialization.
            include_label_text: Whether the serializer includes covered label
                text in its output.
        """
        self.pipeline = Pipeline.from_yaml_file(conf_path)
        if events_addresses is not None:
            self.pipeline.events_address = events_addresses

        # 'None' may arrive as a literal string from config files / CLI args.
        if serializer == 'None':
            serializer = None
        if serializer is None:
            return
        self.pipeline.append(
            LocalProcessor(
                SerializationProcessor(get_serializer(serializer),
                                       output_directory,
                                       include_label_text=include_label_text),
                component_id='serializer'))
Ejemplo n.º 7
0
    def __init__(self,
                 conf: PipelineConf,
                 *,
                 events_client: Optional[EventsClient] = None):
        """Builds the default remote-processor pipeline from a PipelineConf.

        Args:
            conf: The pipeline configuration; its addresses are populated via
                ``populate_addresses`` before use.
            events_client: An existing events client to use. When provided it
                is not closed by this object; otherwise a client is created
                from ``conf.events_address`` and owned (closed) by this
                object.

        Raises:
            ValueError: If neither ``events_client`` nor
                ``conf.events_address`` is given.
        """
        conf.populate_addresses()
        if events_client is not None:
            # Caller owns the client's lifetime; do not close it here.
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            # We created the client, so we are responsible for closing it.
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        # (identifier, address) pairs in pipeline execution order.
        pipeline = [(conf.sentences_id, conf.sentences_address),
                    (conf.section_headers_id, conf.section_headers_address),
                    (conf.tagger_id, conf.tagger_address),
                    (conf.acronyms_id, conf.acronyms_address),
                    (conf.concepts_id, conf.concepts_address),
                    (conf.negation_id, conf.negation_address),
                    (conf.selective_dependencies_id,
                     conf.selective_dependencies_address),
                    (conf.deepen_id, conf.deepen_address)]
        if conf.use_discovery:
            # With discovery enabled, addresses are resolved elsewhere and
            # only the processor identifiers are needed.
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier) for identifier, _ in pipeline])
        else:
            self.pipeline = Pipeline(*[
                RemoteProcessor(identifier, address=addr)
                for identifier, addr in pipeline
            ])
        if conf.serializer is not None:
            serialization_proc = SerializationProcessor(
                get_serializer(conf.serializer),
                conf.output_directory,
                include_label_text=conf.include_label_text)
            ser_comp = LocalProcessor(serialization_proc,
                                      component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)
Ejemplo n.º 8
0
def main(args=None):
    """Converts i2b2/VA 2010 format files into serialized MTAP events using
    a one-sentence-per-line splitter and a remote POS tagger."""
    parser = ArgumentParser(
        description=
        'Converts files from the i2b2/VA 2010 format to serialized MTAP events '
        'containing the ')
    parser.add_argument(
        'input_directory',
        type=Path,
        help=
        'An input directory containing a "txt" folder containing text files '
        'and an "ast" folder containing the assertions in the i2b2/VA '
        'pipe-delimited format.')
    parser.add_argument(
        'output_directory',
        type=Path,
        help='An output directory to write the serialized mtap events to.')
    parser.add_argument('--target-document', default='plaintext')
    parser.add_argument('--serializer',
                        default='pickle',
                        choices=standard_serializers.keys(),
                        help='The serializer to use.')
    parser.add_argument('--events', help="Address of the events client.")
    parser.add_argument('--tagger', help="Address of the pos tagger to use.")

    ns = parser.parse_args(args)

    chosen_serializer = standard_serializers[ns.serializer]

    with EventsClient(address=ns.events) as client, Pipeline(
            LocalProcessor(OnePerLineSentencesProcessor(),
                           component_id='sentences',
                           client=client),
            RemoteProcessor('biomedicus-tnt-tagger', address=ns.tagger),
            LocalProcessor(SerializationProcessor(
                chosen_serializer, output_dir=ns.output_directory),
                           component_id='serializer',
                           client=client)) as pipeline:
        pipeline.run_multithread(
            events(ns.input_directory, ns.target_document, client=client))
        pipeline.print_times()
Ejemplo n.º 9
0
def main(args=None):
    """Converts *.rtf files to plaintext via the rtf-processor service, runs
    sentences/tagger/acronyms over them, and serializes results as JSON."""
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    for flag in ("--events", "--rtf", "--tagger", "--acronyms",
                 "--sentences"):
        parser.add_argument(flag)
    ns = parser.parse_args(args)

    with EventsClient(address=ns.events) as client, Pipeline(
            RemoteProcessor('rtf-processor',
                            address=ns.rtf,
                            params={
                                'binary_data_name': 'rtf',
                                'output_document_name': 'plaintext'
                            }),
            RemoteProcessor('sentences',
                            address=ns.sentences,
                            params={'document_name': 'plaintext'}),
            RemoteProcessor('tnt-tagger',
                            address=ns.tagger,
                            params={'document_name': 'plaintext'}),
            RemoteProcessor('acronyms', address=ns.acronyms),
            LocalProcessor(SerializationProcessor(
                JsonSerializer, output_dir=ns.output_directory),
                           component_id='serialize',
                           client=client)) as pipeline:
        for rtf_path in Path(ns.input_directory).glob("**/*.rtf"):
            contents = rtf_path.read_bytes()
            with Event(event_id=rtf_path.stem, client=client) as event:
                # The rtf-processor reads the raw bytes from this binary slot.
                event.binaries['rtf'] = contents
                pipeline.run(event)

        pipeline.print_times()
Ejemplo n.º 10
0
def main(args=None):
    """Reads PTB formatted documents via the ptb-reader service and writes
    the resulting events out as JSON."""
    parser = ArgumentParser()
    parser.add_argument('input',
                        metavar='INPUT_DIR',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('output',
                        metavar='OUTPUT_DIR',
                        help='A folder to write the json files to.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--events',
                        metavar='EVENTS',
                        default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader',
                        metavar='READER',
                        default=None,
                        help='The address of the PTB Reader.')
    ns = parser.parse_args(args)
    reader_params = {
        'source_document_name': 'source',
        'target_document_name': 'gold',
        'pos_tags_index': 'gold_tags'
    }
    with EventsClient(address=ns.events) as client, Pipeline(
            RemoteProcessor('ptb-reader',
                            address=ns.ptb_reader,
                            params=reader_params),
            LocalProcessor(SerializationProcessor(JsonSerializer,
                                                  output_dir=ns.output),
                           component_id='serializer',
                           client=client)) as pipeline:
        for source_path in Path(ns.input).rglob(ns.glob):
            print('Reading:', source_path)
            text = source_path.read_text()
            with Event(event_id=source_path.name, client=client) as event:
                event.add_document(Document('source', text=text))
                pipeline.run(event)
Ejemplo n.º 11
0
def run_sentences_pipeline(input_directory, skip_file, output_directory):
    """Runs the sentences processor over every *.txt file under
    input_directory, serializing results as JSON and skipping any file whose
    relative path is listed in skip_file.

    Args:
        input_directory: Path to the root directory of input text files.
        skip_file: Path to a file of newline-separated relative paths to skip.
        output_directory: Directory the JSON serializer writes to.
    """
    # read_text() opens, reads, and closes the file; the original version
    # leaked an open file handle here.
    skip_documents = set(Path(skip_file).read_text().splitlines())
    events_address = 'localhost:50100'
    with Pipeline(RemoteProcessor('biomedicus-sentences',
                                  address='localhost:50300'),
                  LocalProcessor(
                      SerializationProcessor(JsonSerializer,
                                             output_directory)),
                  events_address=events_address) as pipeline:
        # Pre-count the input files so run_multithread can report progress.
        total = sum(1 for _ in input_directory.rglob('*.txt'))

        def source():
            # Generator yielding one plaintext document per unskipped file.
            for path in input_directory.rglob('*.txt'):
                relative = str(path.relative_to(input_directory))
                if relative in skip_documents:
                    continue
                with path.open('r') as f:
                    txt = f.read()
                # NOTE(review): only_create_new presumably fails/skips when
                # the event already exists — confirm against the Event API.
                with Event(event_id=relative,
                           client=pipeline.events_client,
                           only_create_new=True) as e:
                    yield e.create_document('plaintext', txt)

        pipeline.run_multithread(source(), total=total, workers=8)
Ejemplo n.º 12
0
def run_serializer_processor(args):
    """Hosts a SerializationProcessor configured from parsed CLI arguments,
    using the serializer named by args.serializer and writing to
    args.output_dir."""
    serializer = _serializers[args.serializer]
    run_processor(SerializationProcessor(serializer, args.output_dir),
                  args=args)