class DefaultPipeline:
    """The biomedicus default pipeline for processing clinical documents.

    Attributes:
        events_client (mtap.EventsClient): An MTAP events client used by the pipeline.
        pipeline (mtap.Pipeline): An MTAP pipeline to use to process documents.
    """

    def __init__(self, conf: PipelineConf, *, events_client: EventsClient = None):
        conf.populate_addresses()
        if events_client is not None:
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        pipeline = [
            (conf.sentences_id, conf.sentences_address),
            (conf.section_headers_id, conf.section_headers_address),
            (conf.tagger_id, conf.tagger_address),
            (conf.acronyms_id, conf.acronyms_address),
            (conf.concepts_id, conf.concepts_address),
            (conf.negation_id, conf.negation_address),
            (conf.selective_dependencies_id, conf.selective_dependencies_address),
            (conf.deepen_id, conf.deepen_address)
        ]
        if conf.use_discovery:
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier) for identifier, _ in pipeline]
            )
        else:
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier, address=addr) for identifier, addr in pipeline]
            )
        if conf.serializer is not None:
            serialization_proc = SerializationProcessor(get_serializer(conf.serializer),
                                                        conf.output_directory,
                                                        include_label_text=conf.include_label_text)
            ser_comp = LocalProcessor(serialization_proc, component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)

    def process_text(self, text: str, *, event_id: str = None) -> ProcessingResult:
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            f = self.pipeline.run(document)
        return f

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pipeline.close()
        if self.close_client:
            self.events_client.close()
class DefaultPipeline:
    def __init__(self, conf: DefaultPipelineConf, *, events_client: EventsClient = None):
        if events_client is not None:
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        pipeline = [(conf.sentences_id, conf.sentences_address),
                    (conf.tagger_id, conf.tagger_address),
                    (conf.acronyms_id, conf.acronyms_address),
                    (conf.concepts_id, conf.concepts_address),
                    (conf.negation_id, conf.negation_address)]
        if conf.use_discovery:
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier) for identifier, _ in pipeline],
                n_threads=conf.threads)
        else:
            self.pipeline = Pipeline(*[
                RemoteProcessor(identifier, address=addr)
                for identifier, addr in pipeline
            ], n_threads=conf.threads)
        if conf.serializer is not None:
            serialization_proc = SerializationProcessor(
                get_serializer(conf.serializer),
                conf.output_directory,
                include_label_text=conf.include_label_text)
            ser_comp = LocalProcessor(serialization_proc,
                                      component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)

    def process_text(self, text: str, *, event_id: str = None) -> ProcessingResult:
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            f = self.pipeline.run(document)
        return f

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pipeline.close()
        if self.close_client:
            self.events_client.close()
class DefaultPipeline:
    """The biomedicus default pipeline for processing clinical documents.

    Attributes:
        events_client (mtap.EventsClient): An MTAP events client used by the pipeline.
        pipeline (mtap.Pipeline): An MTAP pipeline to use to process documents.
    """

    def __init__(self, conf_path: Union[str, Path],
                 output_directory: Union[str, Path], *,
                 events_address: Optional[str] = None,
                 events_client: EventsClient = None,
                 serializer: Optional[str] = None,
                 include_label_text: bool = False):
        if events_address in ('None', 'none', 'null', ''):
            events_address = None
        if events_client is not None:
            self.close_client = False
            self.events_client = events_client
        else:
            self.close_client = True
            self.events_client = EventsClient(address=events_address)

        self.pipeline = Pipeline.from_yaml_file(conf_path)

        if serializer == 'None':
            serializer = None
        if serializer is not None:
            serialization_proc = SerializationProcessor(
                get_serializer(serializer), output_directory,
                include_label_text=include_label_text)
            ser_comp = LocalProcessor(serialization_proc, component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)

    def process_text(self, text: str, *, event_id: str = None) -> ProcessingResult:
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            f = self.pipeline.run(document)
        return f

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pipeline.close()
        if self.close_client:
            self.events_client.close()
class DefaultPipeline:
    """The biomedicus default pipeline for processing clinical documents.

    Attributes:
        events_client (mtap.EventsClient): An MTAP events client used by the pipeline.
        pipeline (mtap.Pipeline): An MTAP pipeline to use to process documents.
    """

    def __init__(self, conf: PipelineConf, *, events_client: EventsClient = None):
        conf.populate_addresses()
        if events_client is not None:
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        pipeline = [(conf.sentences_id, conf.sentences_address),
                    (conf.tagger_id, conf.tagger_address)]
        if conf.use_discovery:
            self.pipeline = Pipeline(
                *[RemoteProcessor(identifier) for identifier, _ in pipeline])
        else:
            self.pipeline = Pipeline(*[
                RemoteProcessor(identifier, address=addr)
                for identifier, addr in pipeline
            ])

    def process_text(self, text: str, *, event_id: str = None) -> ProcessingResult:
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            f = self.pipeline.run(document)
        return f

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pipeline.close()
        if self.close_client:
            self.events_client.close()
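A brief usage sketch for the DefaultPipeline variants above; `my_conf` and the input directory are placeholders rather than part of the biomedicus API, and the class is used as a context manager so the underlying MTAP pipeline and any internally created events client get closed.

from pathlib import Path

def run_default_pipeline(my_conf, input_dir):
    # `my_conf` stands in for a fully populated PipelineConf / DefaultPipelineConf.
    with DefaultPipeline(my_conf) as default_pipeline:
        for path in Path(input_dir).glob('*.txt'):
            with path.open('r') as f:
                text = f.read()
            # process_text creates an event, adds a 'plaintext' document, and runs
            # every configured processor, returning the pipeline's result object.
            result = default_pipeline.process_text(text, event_id=path.stem)
            print(path.stem, result.elapsed_time)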
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument('input', metavar='INPUT_FILE', help='The input GENIA XML file.')
    parser.add_argument('--events', metavar='EVENTS', default=None,
                        help='The address of the events service.')
    parser.add_argument('--tnt-trainer', metavar='TRAINER', default=None,
                        help='The address of the TnT trainer.')
    args = parser.parse_args(args)

    etree = ElementTree.parse(args.input)
    set = etree.getroot()
    with EventsClient(args.events) as client, Pipeline(
            RemoteProcessor('biomedicus-tnt-trainer', address=args.tnt_trainer)) as pipeline:
        for article in set.findall('article'):
            id = list(article.find('articleinfo'))[0].text
            with Event(id, client) as event:
                db = DocumentBuilder()
                for sentence in (article.find('title').findall('sentence')
                                 + article.find('abstract').findall('sentence')):
                    db.add_sentence(sentence)
                d = db.build_doc(event)
                pipeline.run(d)
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    parser.add_argument("--events")
    parser.add_argument("--tagger")
    parser.add_argument("--sentences")
    parser.add_argument("--acronyms")
    parser.add_argument("--norms")
    parser.add_argument("--concepts")
    args = parser.parse_args(args)

    input_dir = Path(args.input_directory)
    with EventsClient(address=args.events) as client, Pipeline(
            RemoteProcessor('biomedicus-sentences', address=args.sentences),
            RemoteProcessor('biomedicus-tnt-tagger', address=args.tagger),
            RemoteProcessor('biomedicus-acronyms', address=args.acronyms),
            RemoteProcessor('biomedicus-concepts', address=args.concepts),
            LocalProcessor(SerializationProcessor(JsonSerializer,
                                                  output_dir=args.output_directory),
                           component_id='serialize',
                           client=client)) as pipeline:
        for path in input_dir.glob("**/*.txt"):
            print("READING FILE:", str(path))
            with path.open('r') as f:
                contents = f.read()
            with Event(event_id=path.stem, client=client) as event:
                document = event.create_document("plaintext", text=contents)
                pipeline.run(document)
        pipeline.print_times()
def test_dependencies(events_service, dependencies_service, test_results):
    test_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'dependencies'
    uas = Accuracy('UAS', equivalence_test=uas_equal)
    las = Accuracy('LAS', equivalence_test=las_equal)
    with EventsClient(address=events_service) as client, \
            Pipeline(
                RemoteProcessor(processor_id='biomedicus-dependencies',
                                address=dependencies_service),
                LocalProcessor(Metrics(uas, las, tested='dependencies',
                                       target='gold_dependencies'),
                               component_id='accuracy',
                               client=client)
            ) as pipeline:
        for test_file in test_dir.glob('**/*.pickle'):
            with PickleSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                results = pipeline.run(document)
                accuracy_dict = results.component_result('accuracy').result_dict
                print('Results for document: UAS: {}. LAS: {}.'.format(
                    accuracy_dict['UAS'], accuracy_dict['LAS']))

        print('UAS:', uas.value)
        print('LAS:', las.value)
        timing_info = pipeline.processor_timer_stats('biomedicus-dependencies').timing_info
        test_results['biomedicus-dependencies'] = {
            'UAS': uas.value,
            'LAS': las.value,
            'Corpus': "MiPACQ converted to UD from PTB test set",
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
def test_tnt_performance(events_service, pos_tags_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'pos_tags'
    accuracy = Accuracy()
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-tnt-tagger',
                            address=pos_tags_service,
                            params={'token_index': 'gold_tags'}),
            LocalProcessor(Metrics(accuracy, tested='pos_tags', target='gold_tags'),
                           component_id='metrics'),
            events_client=client) as pipeline:
        for test_file in input_dir.glob('**/*.pickle'):
            event = PickleSerializer.file_to_event(test_file, client=client)
            with event:
                document = event.documents['gold']
                results = pipeline.run(document)
                print('Accuracy for event - ', event.event_id, ':',
                      results.component_result('metrics').result_dict['accuracy'])

        print('Accuracy:', accuracy.value)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats('biomedicus-tnt-tagger').timing_info
        test_results['TnT Pos Tagger'] = {
            'Accuracy': accuracy.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert accuracy.value > 0.9
def test_concepts_performance(events_service, concepts_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'concepts'
    recall = Accuracy(name='recall', mode='any', fields=['cui'])
    precision = Accuracy(name='precision', mode='any', fields=['cui'])
    with EventsClient(address=events_service) as client, \
            Pipeline(
                RemoteProcessor(processor_id='biomedicus-concepts', address=concepts_service),
                LocalProcessor(Metrics(recall, tested='umls_concepts', target='gold_concepts'),
                               component_id='metrics'),
                LocalProcessor(Metrics(precision, tested='gold_concepts', target='umls_concepts'),
                               component_id='metrics_reverse'),
                events_client=client
            ) as pipeline:
        for test_file in input_dir.glob('**/*.pickle'):
            with PickleSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                pipeline.run(document)

        print('Precision:', precision.value)
        print('Recall:', recall.value)
        timing_info = pipeline.processor_timer_stats('biomedicus-concepts').timing_info
        test_results['Concepts'] = {
            'Precision': precision.value,
            'Recall': recall.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert recall.value > 0.6
def test_java_references(python_events, java_references_processor):
    with EventsClient(address=python_events) as client, Pipeline(
            RemoteProcessor('mtap-java-reference-labels-example-processor',
                            address=java_references_processor)
    ) as pipeline:
        with Event(event_id='1', client=client) as event:
            document = event.create_document('plaintext', 'abcd')
            pipeline.run(document)

            references = document.labels['references']
            assert references[0].a == GenericLabel(0, 1)
            assert references[0].b == GenericLabel(1, 2)
            assert references[1].a == GenericLabel(2, 3)
            assert references[1].b == GenericLabel(3, 4)

            map_references = document.labels['map_references']
            assert map_references[0].ref == {
                'a': GenericLabel(0, 1),
                'b': GenericLabel(1, 2),
                'c': GenericLabel(2, 3),
                'd': GenericLabel(3, 4)
            }

            list_references = document.labels['list_references']
            assert list_references[0].ref == [GenericLabel(0, 1), GenericLabel(1, 2)]
            assert list_references[1].ref == [GenericLabel(2, 3), GenericLabel(3, 4)]
def test_modification_detector_performance(events_service, modification_detector_service,
                                           test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'negation' / 'i2b2_2010'
    confusion = metrics.FirstTokenConfusion()
    metrics_processor = metrics.Metrics(confusion, tested='negated', target='i2b2concepts',
                                        target_filter=is_negated)
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor('biomedicus-negation', address=modification_detector_service,
                            params={'terms_index': 'i2b2concepts'}),
            LocalProcessor(metrics_processor, component_id='metrics', client=client)
    ) as pipeline:
        for test_file in input_dir.glob('**/*.pickle'):
            with PickleSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                results = pipeline.run(document)
                print('F1 for event - "{}": {:0.3f} - elapsed: {}'.format(
                    event.event_id,
                    results.component_result('metrics').result_dict['first_token_confusion']['f1'],
                    results.component_result('biomedicus-negation').timing_info['process_method']
                ))

        print('Overall Precision:', confusion.precision)
        print('Overall Recall:', confusion.recall)
        print('Overall F1:', confusion.f1)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats('biomedicus-negation').timing_info
        test_results['biomedicus-modification'] = {
            'Gold Standard': "2010 i2b2-VA",
            'Precision': confusion.precision,
            'Recall': confusion.recall,
            'F1': confusion.f1,
            'Per-Document Mean Remote Call Duration': str(timing_info['remote_call'].mean),
            'Per-Document Mean Process Method Duration': str(timing_info['process_method'].mean)
        }
def test_sentence_performance(events_service, sentences_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'sentences'
    confusion = metrics.FirstTokenConfusion()
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-sentences', address=sentences_service),
            LocalProcessor(metrics.Metrics(confusion, tested='sentences', target='Sentence'),
                           component_id='metrics',
                           client=client)) as pipeline:
        for test_file in input_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                results = pipeline.run(document)
                print('F1 for event - "{}": {:0.3f} - elapsed: {}'.format(
                    event.event_id,
                    results[1].results['first_token_confusion']['f1'],
                    results[0].timing_info['process_method']))

        print('Overall Precision:', confusion.precision)
        print('Overall Recall:', confusion.recall)
        print('Overall F1:', confusion.f1)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats()[0].timing_info
        test_results['Sentences'] = {
            'Precision': confusion.precision,
            'Recall': confusion.recall,
            'F1': confusion.f1,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert confusion.f1 > 0.85
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument('--events-service')
    parser.add_argument('--sentences-service')
    parser.add_argument('--dependencies-service')
    parser.add_argument('input_file')
    conf = parser.parse_args(args)

    with EventsClient(address=conf.events_service) as client, \
            Pipeline(
                RemoteProcessor('biomedicus-sentences', address=conf.sentences_service),
                RemoteProcessor('biomedicus-dependencies', address=conf.dependencies_service)
            ) as pipeline:
        with open(conf.input_file, 'r') as in_f:
            txt = in_f.read()
        with Event(event_id=Path(conf.input_file).name, client=client) as event:
            document = event.create_document('plaintext', txt)
            pipeline.run(document)
            for sentence in document.labels['sentences']:
                print(sentence.text)
                print('\n')
                for dependency in document.labels['dependencies'].inside(sentence):
                    print((dependency.text, dependency.deprel,
                           dependency.head.text if dependency.head is not None else 'ROOT'))
                print('\n')
def test_acronyms_performance(events_service, acronyms_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_PHI_TEST_DATA']) / 'acronyms'
    top_score_accuracy = Accuracy(name='top_score_accuracy', fields=['expansion'])
    any_accuracy = Accuracy(name='any_accuracy', mode='any', fields=['expansion'])
    detection_recall = Accuracy(name='detection_recall', mode='location', fields=['expansion'])
    detection_precision = Accuracy(name='detection_precision', mode='location',
                                   fields=['expansion'])
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-acronyms', address=acronyms_service),
            LocalProcessor(Metrics(top_score_accuracy, detection_recall, tested='acronyms',
                                   target='gold_acronyms'),
                           component_id='top_score_metrics',
                           client=client),
            LocalProcessor(Metrics(detection_precision, tested='gold_acronyms',
                                   target='acronyms'),
                           component_id='top_score_reverse',
                           client=client),
            LocalProcessor(Metrics(any_accuracy, tested='all_acronym_senses',
                                   target='gold_acronyms'),
                           component_id='all_senses_metrics',
                           client=client)) as pipeline:
        for test_file in input_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                pipeline.run(document)

        print('Top Sense Accuracy:', top_score_accuracy.value)
        print('Any Sense Accuracy:', any_accuracy.value)
        print('Detection Recall:', detection_recall.value)
        print('Detection Precision:', detection_precision.value)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats('biomedicus-acronyms').timing_info
        test_results['acronyms'] = {
            'Top sense accuracy': top_score_accuracy.value,
            'Any sense accuracy': any_accuracy.value,
            'Detection Recall': detection_recall.value,
            'Detection Precision': detection_precision.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert top_score_accuracy.value > 0.4
        assert any_accuracy.value > 0.4
        assert detection_recall.value > 0.65
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("concepts_csv", metavar="PATH_TO_CONCEPTS_CSV")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    parser.add_argument("--sentences")
    parser.add_argument("--tagger")
    parser.add_argument("--acronyms")
    parser.add_argument("--events")
    ns = parser.parse_args(args)

    print('Reading concepts csv...')
    concepts = {}
    with open(ns.concepts_csv, 'r') as f:
        for line in f.readlines():
            splits = line.split(',')
            end = splits[0]
            start = splits[1]
            cui = splits[5]
            identifier = splits[6]
            try:
                v = concepts[identifier]
            except KeyError:
                v = []
                concepts[identifier] = v
            v.append((start, end, cui))

    print('Reading mipacq source files...')
    with EventsClient(address=ns.events) as client, \
            Pipeline(
                RemoteProcessor('biomedicus-sentences', address=ns.sentences),
                RemoteProcessor('biomedicus-tnt-tagger', address=ns.tagger),
                RemoteProcessor('biomedicus-acronyms', address=ns.acronyms),
                LocalProcessor(SerializationProcessor(PickleSerializer,
                                                      output_dir=ns.output_directory),
                               component_id='serialize',
                               client=client)
            ) as pipeline:
        for path in Path(ns.input_directory).glob('**/*.source'):
            identifier = path.stem.split('-')[0]
            try:
                doc_concepts = concepts[identifier]
            except KeyError:
                continue
            with Event(event_id=identifier, client=client) as event:
                with path.open('r') as f:
                    text = f.read()
                document = event.create_document('plaintext', text)
                with document.get_labeler('gold_concepts') as label_concept:
                    for start, end, cui in doc_concepts:
                        label_concept(start, end, cui=cui)
                pipeline.run(document)
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument('input', metavar='INPUT_FOLDER',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--source-name', metavar='DOCUMENT_NAME', default='source',
                        help='What document to dump the PTB text into.')
    parser.add_argument('--target-name', metavar='DOCUMENT_NAME', default='plaintext',
                        help='What document to dump the plaintext and annotations into.')
    parser.add_argument('--events', metavar='EVENTS', default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader', metavar='READER', default=None,
                        help='The address of the PTB Reader.')
    parser.add_argument('--tnt-trainer', metavar='TRAINER', default=None,
                        help='The address of the TnT trainer.')
    args = parser.parse_args(args)

    with EventsClient(address=args.events) as client, Pipeline(
            RemoteProcessor('ptb-reader', address=args.ptb_reader,
                            params={
                                'source_document_name': args.source_name,
                                'target_document_name': args.target_name
                            }),
            RemoteProcessor('biomedicus-tnt-trainer', address=args.tnt_trainer,
                            params={'document_name': args.target_name})) as pipeline:
        for f in Path(args.input).rglob(args.glob):
            print('Reading:', f)
            with f.open('r') as r:
                text = r.read()
            with Event(event_id=f.name, client=client) as event:
                d = Document(args.source_name, text=text)
                event.add_document(d)
                pipeline.run(event)
def test_normalization(events_service, normalization_processor):
    with EventsClient(address=events_service) as client, \
            Pipeline(RemoteProcessor(processor_id='biomedicus_normalizer',
                                     address=normalization_processor)) as pipeline, \
            PickleSerializer.file_to_event(Path(__file__).parent / '97_95.pickle',
                                           client=client) as event:
        document = event.documents['plaintext']
        pipeline.run(document)
        for norm_form in document.get_label_index('norm_forms'):
            if norm_form.text == "according":
                assert norm_form.norm == "accord"
            if norm_form.text == "expressing":
                assert norm_form.norm == "express"
            if norm_form.text == "receiving":
                assert norm_form.norm == "receive"
            if norm_form.text == "days":
                assert norm_form.norm == "day"
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument('--events-service', default='localhost:10100')
    parser.add_argument('--sentences-service', default='localhost:10102')
    conf = parser.parse_args(args)

    with Pipeline(
            RemoteProcessor('biomedicus-sentences', address=conf.sentences_service)
    ) as pipeline, EventsClient(address=conf.events_service) as events_client:
        text = sys.stdin.read()
        with Event(client=events_client) as event:
            doc = event.create_document('plaintext', text)
            result = pipeline.run(doc)
            for sentence in doc.get_label_index('sentences'):
                print('S: "', sentence.text, '"')
            for k, v in result[0].timing_info.items():
                print('{}: {}'.format(k, v))
def test_disc_pipeline(disc_python_events, disc_python_processor, disc_java_processor):
    with EventsClient(address=disc_python_events) as client, mtap.Pipeline(
            RemoteProcessor('mtap-example-processor-python', address='localhost:50501',
                            params={'do_work': True}),
            RemoteProcessor('mtap-example-processor-java', address='localhost:50502',
                            params={'do_work': True})
    ) as pipeline:
        with Event(event_id='1', client=client) as event:
            event.metadata['a'] = 'b'
            document = event.create_document('plaintext', PHASERS)
            pipeline.run(document)

            letter_counts = document.get_label_index('mtap.examples.letter_counts')
            a_counts = letter_counts[0]
            assert a_counts.count == 23
            b_counts = letter_counts[1]
            assert b_counts.count == 6
            pipeline.print_times()

            thes = document.get_label_index("mtap.examples.word_occurrences")
            assert thes[0].start_index == 121
            assert thes[0].end_index == 124
def main(args=None):
    parser = ArgumentParser(
        description='Converts files from the i2b2/VA 2010 format to serialized MTAP events '
                    'containing the ')
    parser.add_argument(
        'input_directory', type=Path,
        help='An input directory containing a "txt" folder containing text files '
             'and an "ast" folder containing the assertions in the i2b2/VA '
             'pipe-delimited format.')
    parser.add_argument(
        'output_directory', type=Path,
        help='An output directory to write the serialized mtap events to.')
    parser.add_argument('--target-document', default='plaintext')
    parser.add_argument('--serializer', default='pickle',
                        choices=standard_serializers.keys(),
                        help='The serializer to use.')
    parser.add_argument('--events', help="Address of the events client.")
    parser.add_argument('--tagger', help="Address of the pos tagger to use.")
    conf = parser.parse_args(args)

    serializer = standard_serializers[conf.serializer]
    with EventsClient(address=conf.events) as client, Pipeline(
            LocalProcessor(OnePerLineSentencesProcessor(),
                           component_id='sentences',
                           client=client),
            RemoteProcessor('biomedicus-tnt-tagger', address=conf.tagger),
            LocalProcessor(SerializationProcessor(serializer,
                                                  output_dir=conf.output_directory),
                           component_id='serializer',
                           client=client)) as pipeline:
        results = pipeline.run_multithread(
            events(conf.input_directory, conf.target_document, client=client))
        pipeline.print_times()
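The `events(...)` source generator consumed by run_multithread above is not shown in this snippet; a hedged, hypothetical sketch of what such a generator might look like, assuming the i2b2/VA layout described in the argument help (a "txt" folder of text files), is:

def events(input_directory, target_document, *, client):
    # Hypothetical sketch only; the real implementation is not included in this snippet.
    for txt_path in (input_directory / 'txt').glob('*.txt'):
        with txt_path.open('r') as f:
            text = f.read()
        with Event(event_id=txt_path.stem, client=client) as event:
            # The sentences processor, tagger, and serializer in the pipeline above
            # operate on the target document created here.
            yield event.create_document(target_document, text=text)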
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument('input', metavar='INPUT_DIR',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('output', metavar='OUTPUT_DIR',
                        help='A folder to write the json files to.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--events', metavar='EVENTS', default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader', metavar='READER', default=None,
                        help='The address of the PTB Reader.')
    args = parser.parse_args(args)

    with EventsClient(address=args.events) as client, Pipeline(
            RemoteProcessor('ptb-reader', address=args.ptb_reader,
                            params={
                                'source_document_name': 'source',
                                'target_document_name': 'gold',
                                'pos_tags_index': 'gold_tags'
                            }),
            LocalProcessor(SerializationProcessor(JsonSerializer, output_dir=args.output),
                           component_id='serializer',
                           client=client)) as pipeline:
        for f in Path(args.input).rglob(args.glob):
            print('Reading:', f)
            with f.open('r') as r:
                text = r.read()
            with Event(event_id=f.name, client=client) as event:
                d = Document('source', text=text)
                event.add_document(d)
                pipeline.run(event)
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    parser.add_argument("--events")
    parser.add_argument("--rtf")
    parser.add_argument("--tagger")
    parser.add_argument("--acronyms")
    parser.add_argument("--sentences")
    args = parser.parse_args(args)

    input_dir = Path(args.input_directory)
    with EventsClient(address=args.events) as client, Pipeline(
            RemoteProcessor('rtf-processor', address=args.rtf,
                            params={
                                'binary_data_name': 'rtf',
                                'output_document_name': 'plaintext'
                            }),
            RemoteProcessor('sentences', address=args.sentences,
                            params={'document_name': 'plaintext'}),
            RemoteProcessor('tnt-tagger', address=args.tagger,
                            params={'document_name': 'plaintext'}),
            RemoteProcessor('acronyms', address=args.acronyms),
            LocalProcessor(SerializationProcessor(JsonSerializer,
                                                  output_dir=args.output_directory),
                           component_id='serialize',
                           client=client)) as pipeline:
        for path in input_dir.glob("**/*.rtf"):
            with path.open('rb') as f:
                contents = f.read()
            with Event(event_id=path.stem, client=client) as event:
                event.binaries['rtf'] = contents
                pipeline.run(event)
        pipeline.print_times()
def call():
    client = EventsClient(address='a', _pool=object(), _channel=events_channel)
    result = client.get_label_index_info(event_id='1', document_name='plaintext')
    return result
def call():
    client = EventsClient(address='a', _pool=object(), _channel=events_channel)
    result = client.get_all_binary_data_names(event_id='1')
    return result
class Pipeline(MutableSequence['processing.ComponentDescriptor']):
    """An object which can be used to build and run a pipeline of remote and local processors.

    Pipelines are a :obj:`~typing.MutableSequence` containing one or more
    :obj:`~mtap.processing.pipeline.ComponentDescriptor`, so a pipeline can be modified after
    creation using this functionality.

    Args:
        *components (ComponentDescriptor): A list of component descriptors created using
            :class:`RemoteProcessor` or :class:`LocalProcessor`.

    Keyword Args:
        name (~typing.Optional[str]): An optional name for the pipeline, defaults to 'pipeline'.
        config (~typing.Optional[Config]): An optional config override.

    Examples:
        Remote pipeline with name discovery:

        >>> with mtap.Events() as events, mtap.Pipeline(
        >>>         RemoteProcessor('processor-1-id'),
        >>>         RemoteProcessor('processor-2-id'),
        >>>         RemoteProcessor('processor-3-id')
        >>>     ) as pipeline:
        >>>     for txt in txts:
        >>>         with events.open_event() as event:
        >>>             document = event.add_document('plaintext', txt)
        >>>             results = pipeline.run(document)

        Remote pipeline using addresses:

        >>> with mtap.Events(address='localhost:50051') as events, mtap.Pipeline(
        >>>         RemoteProcessor('processor-1-name', address='localhost:50052'),
        >>>         RemoteProcessor('processor-2-id', address='localhost:50053'),
        >>>         RemoteProcessor('processor-3-id', address='localhost:50054')
        >>>     ) as pipeline:
        >>>     for txt in txts:
        >>>         event = events.open_event()
        >>>         document = event.add_document('plaintext', txt)
        >>>         results = pipeline.run(document)

        Modifying pipeline

        >>> pipeline = Pipeline(RemoteProcessor('foo', address='localhost:50000'),
        >>>                     RemoteProcessor('bar', address='localhost:50000'))
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None), RemoteProcessor(processor_id='bar', address='localhost:50000', component_id=None, params=None))
        >>> pipeline.append(RemoteProcessor('baz', address='localhost:50001'))
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None), RemoteProcessor(processor_id='bar', address='localhost:50000', component_id=None, params=None), RemoteProcessor(processor_id='baz', address='localhost:50001', component_id=None, params=None))
        >>> del pipeline[1]
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None), RemoteProcessor(processor_id='baz', address='localhost:50001', component_id=None, params=None))
        >>> pipeline[1] = RemoteProcessor(processor_id='bar', address='localhost:50003')
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None), RemoteProcessor(processor_id='bar', address='localhost:50003', component_id=None, params=None))
        >>> pipeline += list(pipeline)  # Putting in a new list to prevent an infinite recursion
        >>> pipeline
        Pipeline(RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None), RemoteProcessor(processor_id='bar', address='localhost:50003', component_id=None, params=None), RemoteProcessor(processor_id='foo', address='localhost:50000', component_id=None, params=None), RemoteProcessor(processor_id='bar', address='localhost:50003', component_id=None, params=None))

    Attributes:
        name (str): The pipeline's name.
""" __slots__ = [ '_component_ids', 'name', '_component_descriptors', 'events_address', 'mp_config', '_created_events_client', '_events_client', 'times_map', '__components' ] def __init__(self, *components: 'processing.ComponentDescriptor', name: Optional[str] = None, events_address: Optional[str] = None, events_client: Optional[EventsClient] = None, mp_config: Optional[MpConfig] = None): self._component_ids = {} self.name = name or 'pipeline' self._component_descriptors = list(components) self.events_address = events_address self._created_events_client = False self._events_client = None if events_client is not None: self.events_client = events_client self.mp_config = mp_config or MpConfig() self.times_map = {} def __reduce__(self): return _create_pipeline, (self.name, self.events_address, self._events_client, self.mp_config) + tuple( self._component_descriptors) @staticmethod def from_yaml_file(conf_path: Union[pathlib.Path, str]) -> 'Pipeline': """Creates a pipeline from a yaml pipeline configuration file. Args: conf_path (str or pathlib.Path): The path to the configuration file. Returns: Pipeline object from the configuration. """ conf_path = pathlib.Path(conf_path) from yaml import load try: from yaml import CLoader as Loader except ImportError: from yaml import Loader with conf_path.open('rb') as f: conf = load(f, Loader=Loader) return Pipeline.load_configuration(conf) @staticmethod def load_configuration(conf: Dict) -> 'Pipeline': """Creates a pipeline from a pipeline configuration dictionary. Args: conf (Dict): The pipeline configuration dictionary. Returns: Pipeline created from the configuration. """ name = conf.get('name', None) events_address = conf.get('events_address', None) or conf.get( 'events_addresses', None) components = [] conf_components = conf.get('components', []) for conf_component in conf_components: components.append( RemoteProcessor(processor_id=conf_component['processor_id'], address=conf_component['address'], component_id=conf_component.get( 'component_id', None), params=dict(conf_component.get('params', {})))) mp_config = MpConfig.from_configuration(conf.get('mp_config', {})) return Pipeline(*components, name=name, events_address=events_address, mp_config=mp_config) @property def events_client(self) -> EventsClient: if self._events_client is not None: return self._events_client self._created_events_client = True self._events_client = EventsClient(address=self.events_address) return self._events_client @events_client.setter def events_client(self, value: EventsClient): self._events_client = value @property def _components(self) -> 'List[processing.ProcessingComponent]': try: return self.__components except AttributeError: self.__components = [ desc.create_pipeline_component(self._component_ids, lambda: self.events_client) for desc in self._component_descriptors ] return self.__components @_components.deleter def _components(self): for component in self.__components: component.close() del self.__components def run_multithread(self, source: Union[Iterable[Union['mtap.Document', 'mtap.Event']], 'processing.ProcessingSource'], *, params: Optional[Dict[str, Any]] = None, show_progress: Optional[bool] = None, total: Optional[int] = None, close_events: Optional[bool] = None, max_failures: Optional[int] = None, workers: Optional[int] = None, read_ahead: Optional[int] = None, mp_context=None): """Runs this pipeline on a source which provides multiple documents / events. Concurrency is per-event, with each event being provided a thread which runs it through the pipeline. 
        Args:
            source (~typing.Union[~typing.Iterable[~typing.Union[Event, Document]], ProcessingSource])
                A generator of events or documents to process. This should be an
                :obj:`~typing.Iterable` of either :obj:`Event` or :obj:`Document` objects or a
                :obj:`~mtap.processing.ProcessingSource`.
            params (~typing.Optional[dict[str, ~typing.Any]])
                Json object containing params specific to processing this event, the existing
                params dictionary defined in :func:`~PipelineBuilder.add_processor` will be
                updated with the contents of this dict.
            show_progress (~typing.Optional[bool])
                Whether to print a progress bar using tqdm.
            total (~typing.Optional[int])
                An optional argument indicating the total number of events / documents that will
                be provided by the iterable, for the progress bar.
            close_events (~typing.Optional[bool])
                Whether the pipeline should close events after they have been fully processed
                through all components.
            max_failures (~typing.Optional[int])
                The number of acceptable failures. Once this amount is exceeded processing will
                halt. Note that because of the nature of concurrency, processing may continue for
                a short amount of time before termination.
            workers (~typing.Optional[int])
                The number of threads to process documents on.
            read_ahead (~typing.Optional[int])
                The number of source documents to read ahead into memory before processing.
            mp_context (multiprocessing context, optional)
                An optional override for the multiprocessing context.

        Examples:
            >>> docs = list(Path('abc/').glob('*.txt'))
            >>> def document_source():
            >>>     for path in docs:
            >>>         with path.open('r') as f:
            >>>             txt = f.read()
            >>>         with Event(event_id=path.name, client=client) as event:
            >>>             doc = event.create_document('plaintext', txt)
            >>>             yield doc
            >>>
            >>> pipeline.run_multithread(document_source(), total=len(docs))

        """
        show_progress = (show_progress if show_progress is not None
                         else self.mp_config.show_progress)
        close_events = close_events if close_events is not None else self.mp_config.close_events
        max_failures = max_failures if max_failures is not None else self.mp_config.max_failures
        workers = workers if workers is not None else self.mp_config.workers
        mp_context = (multiprocessing.get_context(self.mp_config.mp_start_method)
                      if mp_context is None else mp_context)
        read_ahead = read_ahead if read_ahead is not None else self.mp_config.read_ahead
        with _PipelineMultiRunner(self, source, params, show_progress, total, close_events,
                                  max_failures, workers, read_ahead, mp_context) as runner:
            runner.run()

    def run(self, target: Union['mtap.Event', 'mtap.Document'], *,
            params: Optional[Dict[str, Any]] = None) -> 'processing.PipelineResult':
        """Processes the event/document using all of the processors in the pipeline.

        Args:
            target (~typing.Union[Event, Document]): Either an event or a document to process.
            params (dict[str, ~typing.Any]): Json object containing params specific to processing
                this event, the existing params dictionary defined in
                :func:`~PipelineBuilder.add_processor` will be updated with the contents of this
                dict.

        Returns:
            list[ProcessingResult]: The results of all the processors in the pipeline.

        Examples:
            >>> e = mtap.Event()
            >>> document = mtap.Document('plaintext', text="...", event=e)
            >>> with Pipeline(...) as pipeline:
            >>>     pipeline.run(document)
            >>>     # is equivalent to pipeline.run(document.event, params={'document_name': document.document_name})

            The 'document_name' param is used to indicate to :obj:`~mtap.DocumentProcessor` which
            document on the event to process.
""" event, params = _event_and_params(target, params) event_id = event.event_id result = self._run_by_event_id(event_id, event.event_service_instance_id, params) self._add_result_times(result) for component_result in result.component_results: try: event.add_created_indices(component_result.created_indices) except AttributeError: pass return result def _run_by_event_id(self, event_id, event_service_instance_id, params): start = datetime.now() results = [ component.call_process(event_id, event_service_instance_id, params) for component in self._components ] total = datetime.now() - start results = [ _base.ProcessingResult(identifier=component.component_id, result_dict=result[0], timing_info=result[1], created_indices=result[2]) for component, result in zip(self._components, results) ] logger.debug('Finished processing event_id: %s', event_id) return _base.PipelineResult(results, total) def _add_result_times(self, result): times = {} for component_id, _, component_times, _ in result.component_results: times.update({ component_id + ':' + k: v for k, v in component_times.items() }) times[self.name + 'total'] = result.elapsed_time _timing.add_times(self.times_map, times) @overload def processor_timer_stats(self) -> 'List[processing.AggregateTimingInfo]': """Returns the timing information for all processors. Returns: List[AggregateTimingInfo]: A list of timing info objects, one for each processor, in the same order that the processors were added to the pipeline. """ ... @overload def processor_timer_stats( self, identifier: str) -> 'processing.AggregateTimingInfo': """Returns the timing info for one processor. Args: identifier (Optional[str]): The pipeline component_id for the processor to return timing info. Returns: AggregateTimingInfo: The timing info for the specified processor. """ ... def processor_timer_stats(self, identifier=None): if identifier is not None: aggregates = _timing.create_timer_stats(self.times_map, identifier + ':') aggregates = { k[(len(identifier) + 1):]: v for k, v in aggregates.items() } return _base.AggregateTimingInfo(identifier=identifier, timing_info=aggregates) timing_infos = [] for component in self._components: component_id = component.component_id aggregates = _timing.create_timer_stats(self.times_map, component_id + ':') aggregates = { k[(len(component_id) + 1):]: v for k, v in aggregates.items() } timing_infos.append( _base.AggregateTimingInfo(identifier=component_id, timing_info=aggregates)) return timing_infos def pipeline_timer_stats(self) -> 'processing.AggregateTimingInfo': """The aggregated statistics for the global runtime of the pipeline. Returns: AggregateTimingInfo: The timing stats for the global runtime of the pipeline. """ pipeline_id = self.name aggregates = _timing.create_timer_stats(self.times_map, pipeline_id) aggregates = {k[len(pipeline_id):]: v for k, v in aggregates.items()} return _base.AggregateTimingInfo(identifier=self.name, timing_info=aggregates) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def close(self): """Closes any open connections to remote processors. """ for component in self._components: try: component.close() except AttributeError: pass if self._created_events_client: self._events_client.close() def as_processor(self) -> 'processing.EventProcessor': """Returns the pipeline as a processor. Returns: EventProcessor: An event processor that can be added to other pipelines or hosted. 
""" return _PipelineProcessor(self._components) def print_times(self): """Prints all of the times collected during this pipeline using :func:`print`. """ self.pipeline_timer_stats().print_times() for pipeline_timer in self.processor_timer_stats(): pipeline_timer.print_times() def __getitem__(self, item): return self._component_descriptors[item] def __setitem__(self, key, value): self._clear_components() self._component_descriptors[key] = value def __delitem__(self, key): self._clear_components() del self._component_descriptors[key] def __len__(self): return len(self._component_descriptors) def _clear_components(self): try: del self._components except AttributeError: pass def insert(self, index, o) -> None: self._clear_components() self._component_descriptors.insert(index, o) def __repr__(self): return "Pipeline(" + ', '.join( [repr(component) for component in self._component_descriptors]) + ')'
# Copyright 2019 Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Hello world tutorial pipeline."""

import sys

if __name__ == '__main__':
    from mtap import Document, Event, EventsClient, Pipeline, RemoteProcessor

    with EventsClient(address=sys.argv[1]) as client, \
            Pipeline(
                RemoteProcessor(processor_id='hello', address=sys.argv[2])
            ) as pipeline:
        with Event(event_id='1', client=client) as event:
            document = Document(document_name='name', text='YOUR NAME')
            event.add_document(document)
            pipeline.run(document)
            index = document.get_label_index('hello')
            for label in index:
                print(label.response)