def test_concepts_performance(events_service, concepts_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'concepts'
    recall = Accuracy(name='recall', mode='any', fields=['cui'])
    precision = Accuracy(name='precision', mode='any', fields=['cui'])
    with EventsClient(address=events_service) as client, \
            Pipeline(
                RemoteProcessor(processor_id='biomedicus-concepts',
                                address=concepts_service),
                LocalProcessor(Metrics(recall, tested='umls_concepts', target='gold_concepts'),
                               component_id='metrics'),
                LocalProcessor(Metrics(precision, tested='gold_concepts', target='umls_concepts'),
                               component_id='metrics_reverse'),
                events_client=client
            ) as pipeline:
        for test_file in input_dir.glob('**/*.pickle'):
            with PickleSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                pipeline.run(document)
        print('Precision:', precision.value)
        print('Recall:', recall.value)
        timing_info = pipeline.processor_timer_stats('biomedicus-concepts').timing_info
        test_results['Concepts'] = {
            'Precision': precision.value,
            'Recall': recall.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert recall.value > 0.6
def test_acronyms_performance(events_service, acronyms_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_PHI_TEST_DATA']) / 'acronyms'
    top_score_accuracy = Accuracy(name='top_score_accuracy', fields=['expansion'])
    any_accuracy = Accuracy(name='any_accuracy', mode='any', fields=['expansion'])
    detection_recall = Accuracy(name='detection_recall', mode='location',
                                fields=['expansion'])
    detection_precision = Accuracy(name='detection_precision', mode='location',
                                   fields=['expansion'])
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-acronyms',
                            address=acronyms_service),
            LocalProcessor(Metrics(top_score_accuracy, detection_recall,
                                   tested='acronyms', target='gold_acronyms'),
                           component_id='top_score_metrics', client=client),
            LocalProcessor(Metrics(detection_precision,
                                   tested='gold_acronyms', target='acronyms'),
                           component_id='top_score_reverse', client=client),
            LocalProcessor(Metrics(any_accuracy,
                                   tested='all_acronym_senses', target='gold_acronyms'),
                           component_id='all_senses_metrics', client=client)) as pipeline:
        for test_file in input_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                pipeline.run(document)
        print('Top Sense Accuracy:', top_score_accuracy.value)
        print('Any Sense Accuracy:', any_accuracy.value)
        print('Detection Recall:', detection_recall.value)
        print('Detection Precision:', detection_precision.value)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats('biomedicus-acronyms').timing_info
        test_results['acronyms'] = {
            'Top sense accuracy': top_score_accuracy.value,
            'Any sense accuracy': any_accuracy.value,
            'Detection Recall': detection_recall.value,
            'Detection Precision': detection_precision.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert top_score_accuracy.value > 0.4
        assert any_accuracy.value > 0.4
        assert detection_recall.value > 0.65
def run_themes_pipeline(input_directory, annotations_directory, output_directory):
    events_address = 'localhost:50100'
    with Pipeline(
            RemoteProcessor('biomedicus-sentences', address='localhost:50300'),
            LocalProcessor(AttachPalliativeThemesProcessor(annotations_directory)),
            LocalProcessor(CoalescePalliativeThemesProcessor()),
            LocalProcessor(SerializationProcessor(JsonSerializer, output_directory)),
            events_address=events_address) as pipeline:
        source = FilesInDirectoryProcessingSource(pipeline.events_client,
                                                  input_directory)
        pipeline.run_multithread(source, workers=8)
def test_run_concurrently(mocker):
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    client.instance_id = 0
    with Pipeline(
            LocalProcessor(Processor('1'), component_id='processor1'),
            LocalProcessor(Processor('2'), component_id='processor2'),
            LocalProcessor(Processor('3'), component_id='processor3'),
            events_client=client
    ) as pipeline:
        pipeline.events_client = client
        events = [Event() for _ in range(10)]
        pipeline.run_multithread(events, show_progress=False)
def test_sentence_performance(events_service, sentences_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'sentences'
    confusion = metrics.FirstTokenConfusion()
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-sentences',
                            address=sentences_service),
            LocalProcessor(metrics.Metrics(confusion, tested='sentences', target='Sentence'),
                           component_id='metrics', client=client)) as pipeline:
        for test_file in input_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                results = pipeline.run(document)
                print('F1 for event - "{}": {:0.3f} - elapsed: {}'.format(
                    event.event_id,
                    results[1].results['first_token_confusion']['f1'],
                    results[0].timing_info['process_method']))
        print('Overall Precision:', confusion.precision)
        print('Overall Recall:', confusion.recall)
        print('Overall F1:', confusion.f1)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats()[0].timing_info
        test_results['Sentences'] = {
            'Precision': confusion.precision,
            'Recall': confusion.recall,
            'F1': confusion.f1,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert confusion.f1 > 0.85
def main(args=None): parser = ArgumentParser() parser.add_argument("input_directory", metavar="INPUT_DIR") parser.add_argument("output_directory", metavar="OUTPUT_DIR") parser.add_argument("--events") parser.add_argument("--tagger") parser.add_argument("--sentences") parser.add_argument("--acronyms") parser.add_argument("--norms") parser.add_argument("--concepts") args = parser.parse_args(args) input_dir = Path(args.input_directory) with EventsClient(address=args.events) as client, Pipeline( RemoteProcessor('biomedicus-sentences', address=args.sentences), RemoteProcessor('biomedicus-tnt-tagger', address=args.tagger), RemoteProcessor('biomedicus-acronyms', address=args.acronyms), RemoteProcessor('biomedicus-concepts', address=args.concepts), LocalProcessor(SerializationProcessor( JsonSerializer, output_dir=args.output_directory), component_id='serialize', client=client)) as pipeline: for path in input_dir.glob("**/*.txt"): print("READING FILE:", str(path)) with path.open('r') as f: contents = f.read() with Event(event_id=path.stem, client=client) as event: document = event.create_document("plaintext", text=contents) pipeline.run(document) pipeline.print_times()
def test_modification_detector_performance(events_service, modification_detector_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'negation' / 'i2b2_2010'
    confusion = metrics.FirstTokenConfusion()
    metrics_processor = metrics.Metrics(confusion, tested='negated', target='i2b2concepts',
                                        target_filter=is_negated)
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor('biomedicus-negation', address=modification_detector_service,
                            params={'terms_index': 'i2b2concepts'}),
            LocalProcessor(metrics_processor, component_id='metrics', client=client)
    ) as pipeline:
        for test_file in input_dir.glob('**/*.pickle'):
            with PickleSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                results = pipeline.run(document)
                print('F1 for event - "{}": {:0.3f} - elapsed: {}'.format(
                    event.event_id,
                    results.component_result('metrics').result_dict['first_token_confusion']['f1'],
                    results.component_result('biomedicus-negation').timing_info['process_method']
                ))
        print('Overall Precision:', confusion.precision)
        print('Overall Recall:', confusion.recall)
        print('Overall F1:', confusion.f1)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats('biomedicus-negation').timing_info
        test_results['biomedicus-modification'] = {
            'Gold Standard': "2010 i2b2-VA",
            'Precision': confusion.precision,
            'Recall': confusion.recall,
            'F1': confusion.f1,
            'Per-Document Mean Remote Call Duration': str(timing_info['remote_call'].mean),
            'Per-Document Mean Process Method Duration': str(timing_info['process_method'].mean)
        }
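# `is_negated` (the target_filter above) is defined elsewhere in the test
# module. A minimal sketch, assuming the i2b2 concept labels carry an
# `assertion` field holding the 2010 i2b2/VA assertion values ('present',
# 'absent', 'possible', ...); the actual predicate may differ:
def is_negated(label):
    return label.assertion == 'absent'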
def run_rtf_to_text_pipeline(config: Namespace):
    default_config = str(Path(__file__).parent / 'rtf_to_text_pipeline.yml')
    if config.write_config:
        print('Copying from "{}" to "{}"'.format(
            default_config, str(Path.cwd() / 'rtf_to_text_pipeline.yml')))
        shutil.copy2(default_config, 'rtf_to_text_pipeline.yml')
        return
    config_file = config.config
    if config_file is None:
        config_file = default_config
    workers = config.workers
    if workers is None:
        workers = max(os.cpu_count() // 2, 1)
    with Pipeline.from_yaml_file(config_file) as pipeline:
        pipeline += [
            LocalProcessor(WritePlaintext(Path(config.output_directory)),
                           component_id='write_text')
        ]
        input_directory = Path(config.input_directory)
        source = rtf_source(input_directory, config.extension_glob,
                            pipeline.events_client)
        total = sum(1 for _ in input_directory.rglob(config.extension_glob))
        pipeline.run_multithread(source, workers=workers, total=total,
                                 max_failures=config.max_failures)
        pipeline.print_times()
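# `rtf_source` is defined elsewhere. A plausible sketch, assuming each RTF
# file becomes an event carrying the raw bytes under the 'rtf' binary name
# (mirroring the RTF pipeline later in this section); the real generator
# may differ:
def rtf_source(input_directory, extension_glob, events_client):
    for path in input_directory.rglob(extension_glob):
        with path.open('rb') as f:
            rtf = f.read()
        event = Event(event_id=str(path.relative_to(input_directory)),
                      client=events_client)
        event.binaries['rtf'] = rtf
        yield event  # run_multithread closes events once it finishes them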
def test_tnt_performance(events_service, pos_tags_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'pos_tags'
    accuracy = Accuracy()
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-tnt-tagger',
                            address=pos_tags_service,
                            params={'token_index': 'gold_tags'}),
            LocalProcessor(Metrics(accuracy, tested='pos_tags', target='gold_tags'),
                           component_id='metrics'),
            events_client=client) as pipeline:
        for test_file in input_dir.glob('**/*.pickle'):
            with PickleSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['gold']
                results = pipeline.run(document)
                print('Accuracy for event -', event.event_id, ':',
                      results.component_result('metrics').result_dict['accuracy'])
        print('Accuracy:', accuracy.value)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats('biomedicus-tnt-tagger').timing_info
        test_results['TnT Pos Tagger'] = {
            'Accuracy': accuracy.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert accuracy.value > 0.9
def test_run_multi(mocker):
    client = mocker.Mock(EventsClient)
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    processor1 = Processor('1')
    processor2 = Processor('2')
    processor3 = Processor('3')
    with Pipeline(
            LocalProcessor(processor1, component_id='processor1', client=client),
            LocalProcessor(processor2, component_id='processor2', client=client),
            LocalProcessor(processor3, component_id='processor3', client=client)
    ) as pipeline:
        events = [Event() for _ in range(10)]
        results = pipeline.run_multithread(events, show_progress=False)
        for result in results:
            assert len(result) == 3
def test_run_concurrently_with_failure(mocker):
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    client.instance_id = 0
    with Pipeline(
            LocalProcessor(Processor('1'), component_id='processor1'),
            LocalProcessor(Processor('2'), component_id='processor2'),
            LocalProcessor(Processor('3'), component_id='processor3'),
            events_client=client
    ) as pipeline:
        events = [Event(event_id=str(i), client=client) for i in range(7)] + [
            Event(event_id='fail_' + str(i), client=client) for i in range(4)]
        with pytest.raises(ValueError):
            pipeline.run_multithread(events, show_progress=False, max_failures=2)
def __init__(self, conf_path: Union[str, Path],
             output_directory: Union[str, Path], *,
             events_address: Optional[str] = None,
             events_client: EventsClient = None,
             serializer: Optional[str] = None,
             include_label_text: bool = False):
    if events_address in ('None', 'none', 'null', ''):
        events_address = None
    if events_client is not None:
        self.close_client = False
        self.events_client = events_client
    else:
        self.close_client = True
        self.events_client = EventsClient(address=events_address)
    self.pipeline = Pipeline.from_yaml_file(conf_path)
    if serializer == 'None':
        serializer = None
    if serializer is not None:
        serialization_proc = SerializationProcessor(
            get_serializer(serializer), output_directory,
            include_label_text=include_label_text)
        ser_comp = LocalProcessor(serialization_proc,
                                  component_id='serializer',
                                  client=self.events_client)
        self.pipeline.append(ser_comp)
def test_dependencies(events_service, dependencies_service, test_results):
    test_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'dependencies'
    uas = Accuracy('UAS', equivalence_test=uas_equal)
    las = Accuracy('LAS', equivalence_test=las_equal)
    with EventsClient(address=events_service) as client, \
            Pipeline(
                RemoteProcessor(processor_id='biomedicus-dependencies',
                                address=dependencies_service),
                LocalProcessor(Metrics(uas, las, tested='dependencies', target='gold_dependencies'),
                               component_id='accuracy', client=client)
            ) as pipeline:
        for test_file in test_dir.glob('**/*.pickle'):
            with PickleSerializer.file_to_event(test_file, client=client) as event:
                document = event.documents['plaintext']
                results = pipeline.run(document)
                accuracy_dict = results.component_result('accuracy').result_dict
                print('Results for document: UAS: {}. LAS: {}.'.format(
                    accuracy_dict['UAS'], accuracy_dict['LAS']))
        print('UAS:', uas.value)
        print('LAS:', las.value)
        timing_info = pipeline.processor_timer_stats('biomedicus-dependencies').timing_info
        test_results['biomedicus-dependencies'] = {
            'UAS': uas.value,
            'LAS': las.value,
            'Corpus': "MiPACQ converted to UD from PTB test set",
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
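# `uas_equal` and `las_equal` are the equivalence tests passed to Accuracy
# above; they are defined elsewhere. Hypothetical sketches, assuming each
# dependency label exposes a `head` span label and a `deprel` attribute;
# the real tests may compare different fields:
def uas_equal(tested, target):
    # Unlabeled attachment: the predicted head covers the same span.
    return (tested.head.start_index == target.head.start_index
            and tested.head.end_index == target.head.end_index)


def las_equal(tested, target):
    # Labeled attachment: same head span and same dependency relation.
    return uas_equal(tested, target) and tested.deprel == target.deprel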
def test_time_result():
    processor = Processor()
    with Pipeline(
            LocalProcessor(processor, component_id='test_processor', client=None)
    ) as pipeline:
        event = Event()
        results = pipeline.run(event)
        result = results[0]
        assert result.timing_info['process_method'] >= timedelta(seconds=0.001)
def main(args=None): parser = ArgumentParser() parser.add_argument("input_directory", metavar="INPUT_DIR") parser.add_argument("concepts_csv", metavar="PATH_TO_CONCEPTS_CSV") parser.add_argument("output_directory", metavar="OUTPUT_DIR") parser.add_argument("--sentences") parser.add_argument("--tagger") parser.add_argument("--acronyms") parser.add_argument("--events") ns = parser.parse_args(args) print('Reading concepts csv...') concepts = {} with open(ns.concepts_csv, 'r') as f: for line in f.readlines(): splits = line.split(',') end = splits[0] start = splits[1] cui = splits[5] identifier = splits[6] try: v = concepts[identifier] except KeyError: v = [] concepts[identifier] = v v.append((start, end, cui)) print('Reading mipacq source files...') with EventsClient(address=ns.events) as client, \ Pipeline( RemoteProcessor('biomedicus-sentences', address=ns.sentences), RemoteProcessor('biomedicus-tnt-tagger', address=ns.tagger), RemoteProcessor('biomedicus-acronyms', address=ns.acronyms), LocalProcessor(SerializationProcessor(PickleSerializer, output_dir=ns.output_directory), component_id='serialize', client=client) ) as pipeline: for path in Path(ns.input_directory).glob('**/*.source'): identifier = path.stem.split('-')[0] try: doc_concepts = concepts[identifier] except KeyError: continue with Event(event_id=identifier, client=client) as event: with path.open('r') as f: text = f.read() document = event.create_document('plaintext', text) with document.get_labeler('gold_concepts') as label_concept: for start, end, cui in doc_concepts: label_concept(start, end, cui=cui) pipeline.run(document)
def main(args=None):
    parser = ArgumentParser(
        description='Converts files from the i2b2/VA 2010 format to '
                    'serialized MTAP events.')
    parser.add_argument(
        'input_directory', type=Path,
        help='An input directory containing a "txt" folder containing text files '
             'and an "ast" folder containing the assertions in the i2b2/VA '
             'pipe-delimited format.')
    parser.add_argument(
        'output_directory', type=Path,
        help='An output directory to write the serialized mtap events to.')
    parser.add_argument('--target-document', default='plaintext')
    parser.add_argument('--serializer', default='pickle',
                        choices=standard_serializers.keys(),
                        help='The serializer to use.')
    parser.add_argument('--events', help="Address of the events client.")
    parser.add_argument('--tagger', help="Address of the pos tagger to use.")
    conf = parser.parse_args(args)
    serializer = standard_serializers[conf.serializer]
    with EventsClient(address=conf.events) as client, Pipeline(
            LocalProcessor(OnePerLineSentencesProcessor(),
                           component_id='sentences', client=client),
            RemoteProcessor('biomedicus-tnt-tagger', address=conf.tagger),
            LocalProcessor(SerializationProcessor(serializer,
                                                  output_dir=conf.output_directory),
                           component_id='serializer', client=client)) as pipeline:
        pipeline.run_multithread(
            events(conf.input_directory, conf.target_document, client=client))
        pipeline.print_times()
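# `events` is a generator defined elsewhere in this script. A sketch under
# the assumption that it yields one event per file in the "txt" folder; the
# real generator also parses the pipe-delimited "ast" files and labels the
# gold assertions on each document, which is omitted here:
def events(input_directory, target_document, *, client):
    for txt_file in (input_directory / 'txt').glob('*.txt'):
        event = Event(event_id=txt_file.stem, client=client)
        event.create_document(target_document, text=txt_file.read_text())
        yield event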
def test_time_result(mocker):
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    client.instance_id = 0
    with Pipeline(
            LocalProcessor(Processor(), component_id='test_processor'),
            events_client=client
    ) as pipeline:
        event = Event()
        result = pipeline.run(event)
        assert result.component_results[0].timing_info['process_method'] >= timedelta(seconds=0.001)
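# The pipeline unit tests in this section use a `Processor` test double
# defined elsewhere in the test module. A minimal sketch consistent with how
# it is used here (an optional identity string, a short sleep so the timing
# assertions hold, and a failure for event ids beginning with 'fail_'); the
# actual fixture may differ:
from time import sleep

from mtap.processing import EventProcessor


class Processor(EventProcessor):
    def __init__(self, identifier='1'):
        super().__init__()
        self.identifier = identifier

    def process(self, event, params):
        if event.event_id.startswith('fail_'):
            raise ValueError('failing event: ' + event.event_id)
        sleep(0.001)  # keeps 'process_method' timing at or above 1 ms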
def run_pipeline(conf):
    pipeline_conf = conf.pipeline_config or Path(__file__).parent / 'examplePipelineConfiguration.yml'
    pipeline = Pipeline.from_yaml_file(pipeline_conf)
    with mtap.EventsClient(address=conf.events_address) as client:
        pipeline.append(
            LocalProcessor(proc=SerializationProcessor(ser=JsonSerializer,
                                                       output_dir=conf.output_directory),
                           client=client,
                           component_id='serialization_processor'))
        source = FilesInDirectoryProcessingSource(directory=conf.input_directory,
                                                  client=client)
        pipeline.run_multithread(source=source, workers=conf.threads,
                                 max_failures=conf.max_failures,
                                 read_ahead=conf.read_ahead)
def __init__(self, conf_path: Union[str, Path],
             output_directory: Union[str, Path], *,
             events_addresses: Optional[str] = None,
             serializer: Optional[str] = None,
             include_label_text: bool = False):
    self.pipeline = Pipeline.from_yaml_file(conf_path)
    if events_addresses is not None:
        self.pipeline.events_address = events_addresses
    if serializer == 'None':
        serializer = None
    if serializer is not None:
        serialization_proc = SerializationProcessor(
            get_serializer(serializer), output_directory,
            include_label_text=include_label_text)
        ser_comp = LocalProcessor(serialization_proc, component_id='serializer')
        self.pipeline.append(ser_comp)
def __init__(self, conf: PipelineConf, *, events_client: EventsClient = None):
    conf.populate_addresses()
    if events_client is not None:
        self.close_client = False
        self.events_client = events_client
    elif conf.events_address is not None:
        self.close_client = True
        self.events_client = EventsClient(address=conf.events_address)
    else:
        raise ValueError("Events client or address not specified.")
    pipeline = [(conf.sentences_id, conf.sentences_address),
                (conf.section_headers_id, conf.section_headers_address),
                (conf.tagger_id, conf.tagger_address),
                (conf.acronyms_id, conf.acronyms_address),
                (conf.concepts_id, conf.concepts_address),
                (conf.negation_id, conf.negation_address),
                (conf.selective_dependencies_id, conf.selective_dependencies_address),
                (conf.deepen_id, conf.deepen_address)]
    if conf.use_discovery:
        self.pipeline = Pipeline(
            *[RemoteProcessor(identifier) for identifier, _ in pipeline])
    else:
        self.pipeline = Pipeline(
            *[RemoteProcessor(identifier, address=addr)
              for identifier, addr in pipeline])
    if conf.serializer is not None:
        serialization_proc = SerializationProcessor(
            get_serializer(conf.serializer), conf.output_directory,
            include_label_text=conf.include_label_text)
        ser_comp = LocalProcessor(serialization_proc, component_id='serializer',
                                  client=self.events_client)
        self.pipeline.append(ser_comp)
def main(args=None): parser = ArgumentParser() parser.add_argument("input_directory", metavar="INPUT_DIR") parser.add_argument("output_directory", metavar="OUTPUT_DIR") parser.add_argument("--events") parser.add_argument("--rtf") parser.add_argument("--tagger") parser.add_argument("--acronyms") parser.add_argument("--sentences") args = parser.parse_args(args) input_dir = Path(args.input_directory) with EventsClient(address=args.events) as client, Pipeline( RemoteProcessor('rtf-processor', address=args.rtf, params={ 'binary_data_name': 'rtf', 'output_document_name': 'plaintext' }), RemoteProcessor('sentences', address=args.sentences, params={'document_name': 'plaintext'}), RemoteProcessor('tnt-tagger', address=args.tagger, params={'document_name': 'plaintext'}), RemoteProcessor('acronyms', address=args.acronyms), LocalProcessor(SerializationProcessor( JsonSerializer, output_dir=args.output_directory), component_id='serialize', client=client)) as pipeline: for path in input_dir.glob("**/*.rtf"): with path.open('rb') as f: contents = f.read() with Event(event_id=path.stem, client=client) as event: event.binaries['rtf'] = contents pipeline.run(event) pipeline.print_times()
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument('input', metavar='INPUT_DIR',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('output', metavar='OUTPUT_DIR',
                        help='A folder to write the json files to.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--events', metavar='EVENTS', default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader', metavar='READER', default=None,
                        help='The address of the PTB Reader.')
    args = parser.parse_args(args)
    with EventsClient(address=args.events) as client, Pipeline(
            RemoteProcessor('ptb-reader', address=args.ptb_reader,
                            params={'source_document_name': 'source',
                                    'target_document_name': 'gold',
                                    'pos_tags_index': 'gold_tags'}),
            LocalProcessor(SerializationProcessor(JsonSerializer, output_dir=args.output),
                           component_id='serializer', client=client)) as pipeline:
        for f in Path(args.input).rglob(args.glob):
            print('Reading:', f)
            with f.open('r') as r:
                text = r.read()
            with Event(event_id=f.name, client=client) as event:
                d = Document('source', text=text)
                event.add_document(d)
                pipeline.run(event)
def run_sentences_pipeline(input_directory, skip_file, output_directory):
    # read_text() closes the file handle, unlike a bare open().read().
    skip_documents = set(Path(skip_file).read_text().splitlines())
    events_address = 'localhost:50100'
    with Pipeline(
            RemoteProcessor('biomedicus-sentences', address='localhost:50300'),
            LocalProcessor(SerializationProcessor(JsonSerializer, output_directory)),
            events_address=events_address) as pipeline:
        total = sum(1 for _ in input_directory.rglob('*.txt'))

        def source():
            for path in input_directory.rglob('*.txt'):
                relative = str(path.relative_to(input_directory))
                if relative not in skip_documents:
                    with path.open('r') as f:
                        txt = f.read()
                    with Event(event_id=relative, client=pipeline.events_client,
                               only_create_new=True) as e:
                        doc = e.create_document('plaintext', txt)
                        yield doc

        pipeline.run_multithread(source(), total=total, workers=8)