def test_yml_serializer():
    # Round-trips an event through the YAML serializer and verifies that the
    # event id, metadata, document text, and every label index survive intact.
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    one = label(start_index=0, end_index=5, x=10)
    two = label(start_index=6, end_index=10, x=15)
    document.add_labels('one', [one, two])
    # The 'two' index carries label-valued fields (b=one / b=two) to exercise
    # serialization of label references.
    document.add_labels('two', [label(start_index=0, end_index=25, a='b', b=one),
                                label(start_index=26, end_index=42, a='c', b=two)])
    # distinct=True exercises the non-overlapping (distinct) index code path.
    document.add_labels('three', [
        label(start_index=0, end_index=10, foo=True),
        label(start_index=11, end_index=15, foo=False)
    ], distinct=True)
    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        e = YamlSerializer.file_to_event(tf)
    assert e.event_id == event.event_id
    assert e.metadata['foo'] == 'bar'
    d = e.documents['plaintext']
    assert d.text == document.text
    index_one = d.labels['one']
    assert index_one == [one, two]
    index_two = d.labels['two']
    assert index_two == [label(start_index=0, end_index=25, a='b', b=one),
                         label(start_index=26, end_index=42, a='c', b=two)]
    index_three = d.labels['three']
    assert index_three == [label(start_index=0, end_index=10, foo=True),
                           label(start_index=11, end_index=15, foo=False)]
def copy_document(event: mtap.Event, source_document_name: str, target_document_name: str, index_names: typing.Sequence[str] = ...):
    """Duplicates a document, including its label indices, under a new name
    on the same event.

    Parameters
    ----------
    event: Event
        The event holding the source document.
    source_document_name: str
        Name of the document to copy from.
    target_document_name: str
        Name for the newly created copy.
    index_names: Sequence[str]
        Optional subset of label index names to copy; when omitted every
        index on the source document is copied.
    """
    source = event.documents[source_document_name]
    target = mtap.Document(target_document_name, text=source.text)
    event.add_document(target)
    # The Ellipsis sentinel means "copy everything".
    names = list(source.labels) if index_names is ... else index_names
    for name in names:
        source_index = source.labels[name]
        target.add_labels(name, source_index, distinct=source_index.distinct)
def source():
    """Yields one 'plaintext' document per .txt file under input_dir."""
    for txt_path in input_dir.rglob('*.txt'):
        # errors='replace' tolerates files with undecodable bytes.
        with txt_path.open('r', errors='replace') as reader:
            contents = reader.read()
        rel_name = str(txt_path.relative_to(input_dir))
        ev = Event(event_id=rel_name, client=default_pipeline.events_client)
        yield ev.create_document('plaintext', contents)
def test_event_to_dict_include_label_text():
    """include_label_text=True should attach '_text' covering text to labels."""
    ev = Event()
    doc = ev.create_document('plaintext', text)
    doc.add_labels('sentences', [label(0, 117)])
    doc.add_labels('tokens', [label(s, e) for s, e in tokens])
    as_dict = event_to_dict(ev, include_label_text=True)
    doc_dict = as_dict['documents']['plaintext']
    sentence_labels = doc_dict['label_indices']['sentences']
    assert sentence_labels['json_labels'][0]['_text'] == text
    token_labels = doc_dict['label_indices']['tokens']['json_labels']
    for idx, tok in enumerate(token_labels):
        start, end = tokens[idx]
        assert tok['_text'] == text[start:end]
def test_print_debug_all():
    """print_debug='all' should emit both false positives and false negatives."""
    ev = Event()
    doc = ev.create_document(
        'test', 'The quick brown fox jumps over the lazy dog.')
    with doc.get_labeler('target') as label_target:
        label_target(16, 19)
    with doc.get_labeler('tested') as label_tested:
        label_tested(10, 15)
    out = StringIO()
    metric = FirstTokenConfusion(print_debug='all', debug_handle=out)
    metric.update(doc, doc.labels['tested'], doc.labels['target'])
    expected = ('False Positives\nThe quick {brown} fox jumps over the lazy dog.\n\n'
                'False Negatives\nThe quick brown {fox} jumps over the lazy dog.\n\n')
    assert out.getvalue() == expected
def test_run_concurrently_with_failure(mocker):
    """run_multithread must raise once failures exceed max_failures."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    client.instance_id = 0
    with Pipeline(
            LocalProcessor(Processor('1', ), component_id='processor1'),
            LocalProcessor(Processor('2', ), component_id='processor2'),
            LocalProcessor(Processor('3', ), component_id='processor3'),
            events_client=client
    ) as pipeline:
        # 7 ordinary events plus 4 whose ids trigger the mock processor's
        # failure path -- more than the max_failures=2 allowance.
        events = [Event(event_id=str(i), client=client) for i in range(7)]
        events += [Event(event_id='fail_' + str(i), client=client)
                   for i in range(4)]
        with pytest.raises(ValueError) as e_info:
            pipeline.run_multithread(events, show_progress=False, max_failures=2)
def test_java_references(python_events, java_references_processor):
    # End-to-end check that label references (single, map-valued, and
    # list-valued) produced by the Java reference-labels example processor
    # round-trip back to the Python client correctly.
    with EventsClient(address=python_events) as client, Pipeline(
            RemoteProcessor('mtap-java-reference-labels-example-processor',
                            address=java_references_processor)
    ) as pipeline:
        with Event(event_id='1', client=client) as event:
            document = event.create_document('plaintext', 'abcd')
            pipeline.run(document)
            # Direct label-to-label references.
            references = document.labels['references']
            assert references[0].a == GenericLabel(0, 1)
            assert references[0].b == GenericLabel(1, 2)
            assert references[1].a == GenericLabel(2, 3)
            assert references[1].b == GenericLabel(3, 4)
            # References stored inside a mapping field.
            map_references = document.labels['map_references']
            assert map_references[0].ref == {
                'a': GenericLabel(0, 1),
                'b': GenericLabel(1, 2),
                'c': GenericLabel(2, 3),
                'd': GenericLabel(3, 4)
            }
            # References stored inside a list field.
            list_references = document.labels['list_references']
            assert list_references[0].ref == [GenericLabel(0, 1), GenericLabel(1, 2)]
            assert list_references[1].ref == [GenericLabel(2, 3), GenericLabel(3, 4)]
def main(args=None):
    """Runs the sentences and dependencies processors over a single text file
    and prints the dependency parse for every sentence.

    Args:
        args: Optional argument list; defaults to sys.argv when None.
    """
    parser = ArgumentParser()
    parser.add_argument('--events-service')
    parser.add_argument('--sentences-service')
    parser.add_argument('--dependencies-service')
    parser.add_argument('input_file')
    conf = parser.parse_args(args)
    with EventsClient(address=conf.events_service) as client, \
            Pipeline(
                RemoteProcessor('biomedicus-sentences',
                                address=conf.sentences_service),
                RemoteProcessor('biomedicus-dependencies',
                                address=conf.dependencies_service)
            ) as pipeline:
        with open(conf.input_file, 'r') as in_f:
            txt = in_f.read()
        # The input file's name doubles as the event id.
        with Event(event_id=Path(conf.input_file).name, client=client) as event:
            document = event.create_document('plaintext', txt)
            pipeline.run(document)
            for sentence in document.labels['sentences']:
                print(sentence.text)
                print('\n')
                # A dependency with no head is the sentence ROOT.
                for dependency in document.labels['dependencies'].inside(
                        sentence):
                    print((dependency.text, dependency.deprel,
                           dependency.head.text if dependency.head is not None else 'ROOT'))
                print('\n')
def main(args=None):
    """Parses a GENIA XML corpus and sends each article's sentences to the
    TnT trainer pipeline.

    Args:
        args: Optional argument list; defaults to sys.argv when None.
    """
    parser = ArgumentParser()
    parser.add_argument('input', metavar='INPUT_FILE',
                        help='The input GENIA XML file.')
    parser.add_argument('--events', metavar='EVENTS', default=None,
                        help='The address of the events service.')
    parser.add_argument('--tnt-trainer', metavar='TRAINER', default=None,
                        help='The address of the TnT trainer.')
    args = parser.parse_args(args)
    etree = ElementTree.parse(args.input)
    # Renamed from 'set' so the builtin is not shadowed.
    root = etree.getroot()
    with EventsClient(args.events) as client, Pipeline(
            RemoteProcessor('biomedicus-tnt-trainer',
                            address=args.tnt_trainer)) as pipeline:
        for article in root.findall('article'):
            # The first child of <articleinfo> holds the article identifier.
            # Renamed from 'id' so the builtin is not shadowed.
            article_id = list(article.find('articleinfo'))[0].text
            with Event(article_id, client) as event:
                db = DocumentBuilder()
                sentences = (article.find('title').findall('sentence')
                             + article.find('abstract').findall('sentence'))
                for sentence in sentences:
                    db.add_sentence(sentence)
                d = db.build_doc(event)
                pipeline.run(d)
def main(args=None):
    """Runs sentences, POS tagging, acronym detection, and concept detection
    over every .txt file in a directory, serializing each result to JSON.

    Args:
        args: Optional argument list; defaults to sys.argv when None.
    """
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    parser.add_argument("--events")
    parser.add_argument("--tagger")
    parser.add_argument("--sentences")
    parser.add_argument("--acronyms")
    parser.add_argument("--norms")
    parser.add_argument("--concepts")
    args = parser.parse_args(args)
    input_dir = Path(args.input_directory)
    with EventsClient(address=args.events) as client, Pipeline(
            RemoteProcessor('biomedicus-sentences', address=args.sentences),
            RemoteProcessor('biomedicus-tnt-tagger', address=args.tagger),
            RemoteProcessor('biomedicus-acronyms', address=args.acronyms),
            RemoteProcessor('biomedicus-concepts', address=args.concepts),
            # Final local step writes each processed event to OUTPUT_DIR.
            LocalProcessor(SerializationProcessor(
                JsonSerializer, output_dir=args.output_directory),
                component_id='serialize', client=client)) as pipeline:
        for path in input_dir.glob("**/*.txt"):
            print("READING FILE:", str(path))
            with path.open('r') as f:
                contents = f.read()
            # The file stem is reused as the event id.
            with Event(event_id=path.stem, client=client) as event:
                document = event.create_document("plaintext", text=contents)
                pipeline.run(document)
        pipeline.print_times()
def process(self, event: Event, params: Dict[str, Any]):
    """Mock processing: counts the call, raises for ids containing 'fail',
    then stamps the event metadata with this processor's identifier."""
    self.seen += 1
    if 'fail' in event.event_id:
        raise ValueError("fail")
    time.sleep(0.001)
    ident = self.identifier
    event.metadata[ident] = 'True'
    event.metadata['processor'] = ident
    self.processed += 1
def process_text(self, text: str, *, event_id: str = None) -> ProcessingResult:
    """Wraps raw text in a one-document event and runs the pipeline on it."""
    with Event(event_id=event_id, client=self.events_client) as event:
        doc = event.create_document('plaintext', text=text)
        return self.pipeline.run(doc)
def create_document(self):
    """Yields a fresh 'plaintext' document pre-loaded with the gold label
    indices held on this fixture."""
    with Event(client=self.client) as ev:
        doc = ev.create_document('plaintext', self.txt)
        for index_name, index_labels in (('gold_dependencies', self.all_deps),
                                         ('sentences', self.sentences),
                                         ('pos_tags', self.pos_tags),
                                         ('norm_forms', self.norms)):
            doc.add_labels(index_name, index_labels)
        yield doc
def test_time_result():
    """The pipeline result must report at least the processor's sleep time."""
    proc = Processor()
    with Pipeline(
            LocalProcessor(proc, component_id='test_processor', client=None)
    ) as pipeline:
        timing = pipeline.run(Event())[0].timing_info
    assert timing['process_method'] >= timedelta(seconds=0.001)
def test_copy_document():
    """CopyDocument should clone both the text and the label indices."""
    event = Event()
    original = Document(document_name='first',
                        text='The quick brown fox jumped over the lazy dog.')
    event.add_document(original)
    words = [(0, 3, 'The'), (4, 9, 'quick'), (10, 15, 'brown')]
    with original.get_labeler('some_index') as labeler:
        for start, end, word in words:
            labeler(start, end, word=word)
    CopyDocument('first', 'second').process(event, {})
    copied = event.documents['second']
    assert copied is not None
    assert copied.labels['some_index'] == [
        GenericLabel(start, end, word=word) for start, end, word in words
    ]
def test_labeler_distinct_and_type_id_raises(mocker):
    """get_labeler must reject distinct=True combined with a label adapter."""
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event)
    # Narrowed from wrapping the entire body: only the call under test belongs
    # inside pytest.raises, so a ValueError during setup fails the test
    # instead of silently satisfying the expectation.
    with pytest.raises(ValueError):
        document.get_labeler('index', distinct=True,
                             label_adapter=DistinctGenericLabelAdapter)
def source():
    """Yields a plaintext document for each .txt file not in skip_documents."""
    for doc_path in input_directory.rglob('*.txt'):
        rel = str(doc_path.relative_to(input_directory))
        if rel in skip_documents:
            continue
        with doc_path.open('r') as reader:
            contents = reader.read()
        with Event(event_id=rel, client=pipeline.events_client,
                   only_create_new=True) as ev:
            yield ev.create_document('plaintext', contents)
def main(args=None):
    """Reads MiPACQ source files, attaches gold concept labels from a CSV,
    runs the sentences/tagger/acronyms pipeline, and pickles the results.

    Args:
        args: Optional argument list; defaults to sys.argv when None.
    """
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("concepts_csv", metavar="PATH_TO_CONCEPTS_CSV")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    parser.add_argument("--sentences")
    parser.add_argument("--tagger")
    parser.add_argument("--acronyms")
    parser.add_argument("--events")
    ns = parser.parse_args(args)
    print('Reading concepts csv...')
    # Maps document identifier -> list of (start, end, cui) tuples.
    concepts = {}
    with open(ns.concepts_csv, 'r') as f:
        # Iterate the file directly instead of materializing readlines().
        for line in f:
            splits = line.split(',')
            # NOTE(review): column 0 is treated as the end offset and column 1
            # as the start offset -- confirm against the CSV's actual schema.
            end = splits[0]
            start = splits[1]
            cui = splits[5]
            identifier = splits[6]
            concepts.setdefault(identifier, []).append((start, end, cui))
    print('Reading mipacq source files...')
    with EventsClient(address=ns.events) as client, \
            Pipeline(
                RemoteProcessor('biomedicus-sentences', address=ns.sentences),
                RemoteProcessor('biomedicus-tnt-tagger', address=ns.tagger),
                RemoteProcessor('biomedicus-acronyms', address=ns.acronyms),
                LocalProcessor(SerializationProcessor(PickleSerializer,
                                                      output_dir=ns.output_directory),
                               component_id='serialize',
                               client=client)
            ) as pipeline:
        for path in Path(ns.input_directory).glob('**/*.source'):
            identifier = path.stem.split('-')[0]
            try:
                doc_concepts = concepts[identifier]
            except KeyError:
                # No gold concepts for this document; skip it.
                continue
            with Event(event_id=identifier, client=client) as event:
                with path.open('r') as f:
                    text = f.read()
                document = event.create_document('plaintext', text)
                with document.get_labeler('gold_concepts') as label_concept:
                    for start, end, cui in doc_concepts:
                        # Offsets arrive from the CSV as strings; label
                        # indices must be integers.
                        label_concept(int(start), int(end), cui=cui)
                pipeline.run(document)
def rtf_source(input_directory: Path, extension_glob: str, events_client: EventsClient):
    """Yields one event per matching file, carrying the raw RTF bytes in the
    event's 'rtf' binary slot."""
    input_directory = Path(input_directory)
    for rtf_path in input_directory.rglob(extension_glob):
        rtf_bytes = rtf_path.read_bytes()
        rel = str(rtf_path.relative_to(input_directory))
        with Event(event_id=rel, client=events_client,
                   only_create_new=True) as event:
            event.binaries['rtf'] = rtf_bytes
            yield event
def provide(self, consume: Callable[[Union[Document, Event]], None]):
    """Feeds up to conf.limit plaintext documents from input_dir to consume."""
    for count, txt_path in enumerate(input_dir.rglob('*.txt'), start=1):
        if count > conf.limit:
            break
        # errors='replace' tolerates files with undecodable bytes.
        with txt_path.open('r', errors='replace') as reader:
            contents = reader.read()
        rel = str(txt_path.relative_to(input_dir))
        with Event(event_id=rel, client=default_pipeline.events_client,
                   only_create_new=True) as ev:
            consume(ev.create_document('plaintext', contents))
def on_created(self, event: FileSystemEvent):
    """Watchdog callback: reads a newly created RTF file matching the glob
    and hands the resulting event to the consumer."""
    if event.is_directory:
        return
    # BUG FIX: src_path is an attribute, not a method -- calling it raised
    # TypeError. The sibling plaintext handler already accesses it correctly.
    src_path = event.src_path
    if fnmatch.fnmatch(src_path, self.extension_glob):
        path = Path(src_path)
        # Binary read; the (no-op) errors=None argument was dropped.
        with path.open('rb') as f:
            rtf = f.read()
        relative = str(path.relative_to(self.input_directory))
        with Event(event_id=relative, client=self.events_client,
                   only_create_new=True) as event:
            event.binaries['rtf'] = rtf
            self.consume(event)
def test_time_result(mocker):
    """The component result must report at least the processor's sleep time."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    client.instance_id = 0
    with Pipeline(LocalProcessor(Processor(), component_id='test_processor'),
                  events_client=client) as pipeline:
        result = pipeline.run(Event())
    elapsed = result.component_results[0].timing_info['process_method']
    assert elapsed >= timedelta(seconds=0.001)
def test_fields():
    """Accuracy restricted to field 'x' ignores mismatches in other fields."""
    with Event(event_id='1') as event:
        doc = event.create_document('test', 'This is some text.')
        with doc.get_labeler('tested') as tested:
            tested(0, 5, x=1, y=3)
            tested(6, 10, x=3, y=4)
        with doc.get_labeler('target') as target:
            target(0, 5, x=1, y=5)
            target(6, 10, x=2, y=6)
        accuracy = Accuracy(fields=['x'])
        Metrics(accuracy, tested='tested', target='target').process_document(
            doc, params={})
        # Only the first pair agrees on x -> one of two correct.
        assert abs(accuracy.value - 0.5) < 1e-6
def test_any():
    """mode='any' counts a target correct when any overlapping tested label
    matches it."""
    with Event(event_id='1') as event:
        doc = event.create_document('test', 'This is some text.')
        with doc.get_labeler('tested') as tested:
            tested(0, 5, x=1)
            tested(0, 5, x=3)
        with doc.get_labeler('target') as target:
            target(0, 5, x=1)
            target(6, 10, x=2)
        accuracy = Accuracy(mode='any')
        Metrics(accuracy, tested='tested', target='target').process_document(
            doc, params={})
        # Only the first target span has a matching tested label.
        assert abs(accuracy.value - 0.5) < 1e-6
def on_created(self, event: FileSystemEvent):
    """Watchdog callback: processes a newly created text file matching the
    glob and hands the resulting document to the consumer."""
    if event.is_directory:
        return
    src_path = event.src_path
    if not fnmatch.fnmatch(src_path, self.extension_glob):
        return
    print('Processing: ' + src_path)
    path = Path(src_path)
    with path.open('r', errors=None) as f:
        txt = f.read()
    relative = str(path.relative_to(self.input_directory))
    with Event(event_id=relative, client=self.events_client,
               only_create_new=True) as e:
        doc = e.create_document(self.document_name, txt)
        self.consume(doc)
def test_run_concurrently(mocker):
    """Smoke test: multithreaded run over ten events completes cleanly."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    client.instance_id = 0
    components = [
        LocalProcessor(Processor('1', ), component_id='processor1'),
        LocalProcessor(Processor('2', ), component_id='processor2'),
        LocalProcessor(Processor('3', ), component_id='processor3'),
    ]
    with Pipeline(*components, events_client=client) as pipeline:
        # NOTE(review): redundant -- the constructor already received this
        # client via events_client.
        pipeline.events_client = client
        pipeline.run_multithread([Event() for _ in range(10)],
                                 show_progress=False)
def test_run_multi(mocker):
    """Every event's result should contain one entry per pipeline component."""
    client = mocker.Mock(EventsClient)
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    components = [
        LocalProcessor(Processor(str(n)), component_id='processor' + str(n),
                       client=client)
        for n in (1, 2, 3)
    ]
    with Pipeline(*components) as pipeline:
        events = [Event() for _ in range(10)]
        for outcome in pipeline.run_multithread(events, progress=False):
            assert len(outcome) == 3
def main(args=None):
    """Reads PTB-formatted files, converts each to plaintext + annotations via
    the PTB reader, and feeds the result to the TnT trainer.

    Args:
        args: Optional argument list; defaults to sys.argv when None.
    """
    parser = ArgumentParser()
    parser.add_argument('input', metavar='INPUT_FOLDER',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--source-name', metavar='DOCUMENT_NAME',
                        default='source',
                        help='What document to dump the PTB text into.')
    parser.add_argument(
        '--target-name', metavar='DOCUMENT_NAME', default='plaintext',
        help='What document to the plaintext and annotations into.')
    parser.add_argument('--events', metavar='EVENTS', default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader', metavar='READER', default=None,
                        help='The address of the PTB Reader.')
    parser.add_argument('--tnt-trainer', metavar='TRAINER', default=None,
                        help='The address of the TnT trainer.')
    args = parser.parse_args(args)
    with EventsClient(address=args.events) as client, Pipeline(
            # The reader converts the raw PTB source document into the
            # plaintext target document the trainer consumes.
            RemoteProcessor('ptb-reader', address=args.ptb_reader,
                            params={
                                'source_document_name': args.source_name,
                                'target_document_name': args.target_name
                            }),
            RemoteProcessor('biomedicus-tnt-trainer', address=args.tnt_trainer,
                            params={'document_name': args.target_name})) as pipeline:
        for f in Path(args.input).rglob(args.glob):
            print('Reading:', f)
            with f.open('r') as r:
                text = r.read()
            with Event(event_id=f.name, client=client) as event:
                d = Document(args.source_name, text=text)
                event.add_document(d)
                pipeline.run(event)
def test_begin_token_precision_recall_f1():
    """Two of three sentence starts agree in each direction -> P = R = F1 = 2/3."""
    with Event() as event:
        doc = event.create_document(
            'test', 'The quick brown fox jumps over the lazy dog.')
        with doc.get_labeler('tested') as label_tested:
            for begin, end in ((0, 9), (10, 19), (20, 44)):
                label_tested(begin, end)
        with doc.get_labeler('target') as label_target:
            for begin, end in ((0, 19), (20, 30), (31, 44)):
                label_target(begin, end)
        metric = FirstTokenConfusion()
        metric.update(doc, doc.labels['tested'], doc.labels['target'])
        assert metric.precision == 2 / 3
        assert metric.recall == 2 / 3
        assert metric.f1 == 2 / 3
def main(args=None):
    """Reads text from stdin, runs sentence detection, and prints each
    detected sentence followed by the pipeline timing info."""
    parser = ArgumentParser()
    parser.add_argument('--events-service', default='localhost:10100')
    parser.add_argument('--sentences-service', default='localhost:10102')
    conf = parser.parse_args(args)
    with Pipeline(RemoteProcessor('biomedicus-sentences',
                                  address=conf.sentences_service)) as pipeline, \
            EventsClient(address=conf.events_service) as events_client:
        text = sys.stdin.read()
        with Event(client=events_client) as event:
            doc = event.create_document('plaintext', text)
            result = pipeline.run(doc)
            for sentence in doc.get_label_index('sentences'):
                print('S: "', sentence.text, '"')
            for k, v in result[0].timing_info.items():
                print('{}: {}'.format(k, v))