def build_doc(self, event):
    """Create a 'plaintext' Document from the collected text, tags and sentences.

    The document is built by joining the accumulated ``self.text`` fragments
    with spaces, added to *event*, labeled with distinct 'pos_tags' and
    'sentences' indices, and returned.
    """
    doc = Document(document_name='plaintext', text=' '.join(self.text))
    event.add_document(doc)
    # Both indices are distinct (non-overlapping) label indices.
    for index_name, labels in (('pos_tags', self.tags),
                               ('sentences', self.sentences)):
        doc.add_labels(index_name, labels, distinct=True)
    return doc
def process_document(self, document: Document, params: Dict[str, Any]):
    """Dependency-parse only sentences that can carry a negation.

    A sentence is parsed only when it contains at least one term from the
    configured terms index (``params['terms_index']``, default 'umls_terms')
    and at least one 'negation_triggers' label. The resulting dependency and
    universal POS labels are attached as 'dependencies' and 'upos_tags'.
    """
    terms = document.labels[params.get('terms_index', 'umls_terms')]
    triggers = document.labels['negation_triggers']

    # Keep only sentences with at least one term AND one negation trigger.
    candidates = [
        sentence for sentence in document.labels['sentences']
        if len(terms.inside(sentence)) > 0 and len(triggers.inside(sentence)) > 0
    ]

    # One pipeline call over all candidate sentence texts.
    stanza_doc = self.nlp([sentence.text for sentence in candidates])

    all_deps = []
    all_upos_tags = []
    for sentence, stanza_sentence in zip(candidates, stanza_doc.sentences):
        sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(
            sentence, stanza_sentence)
        all_deps.extend(sentence_deps)
        all_upos_tags.extend(sentence_upos_tags)

    document.add_labels('dependencies', all_deps)
    document.add_labels('upos_tags', all_upos_tags)
def process_document(self, document: mtap.Document, params: Dict[str, Any]):
    """Demonstrates the three ways labels can reference other labels."""
    # Four plain labels that the indices below will point at.
    targets = [mtap.GenericLabel(i, i + 1) for i in range(4)]

    # references can be a map of strings to labels
    with document.get_labeler('map_references') as map_labeler:
        map_labeler(0, 4, ref=dict(zip('abcd', targets)))

    # references can be a list of labels
    with document.get_labeler('list_references') as list_labeler:
        list_labeler(0, 2, ref=targets[:2])
        list_labeler(2, 3, ref=targets[2:])

    # references can be direct
    with document.get_labeler('references') as direct_labeler:
        direct_labeler(0, 2, a=targets[0], b=targets[1])
        direct_labeler(2, 3, a=targets[2], b=targets[3])

    # Referenced labels don't need to be added via "addLabels" or
    # "Labeler.close" before label indices that reference them.
    # The Document will delay uploading any label indices to the server
    # until they are.
    document.add_labels('referenced', targets)
def copy_document(event: Event,
                  source_document_name: str,
                  target_document_name: str,
                  index_names: Sequence[str] = ...):
    """Copies one document to another on the same event.

    Parameters
    ----------
    event: Event
        The event.
    source_document_name: str
        The source document name.
    target_document_name: str
        The target document name.
    index_names: Sequence[str]
        If specified will only copy the specified label indices, by default all
        indices will be copied.
    """
    source = event.documents[source_document_name]
    target = Document(target_document_name, text=source.text)
    event.add_document(target)
    # Ellipsis is the "copy everything" sentinel: expand to every index name.
    if index_names is ...:
        index_names = [info.index_name
                       for info in source.get_label_indices_info()]
    for name in index_names:
        index = source.get_label_index(name)
        target.add_labels(name, index, distinct=index.distinct)
def process_document(self, document: Document, params: Dict[str, Any]):
    """Parse the document's sentences with stanza and attach 'dependencies'
    and 'upos_tags' label indices.

    Each dependency label references its head label via ``head`` and its
    children via ``dependents``; the sentence root has ``head=None``.
    """
    sentences = document.labels['sentences']
    sentences_text = []
    for sentence in sentences:
        sentences_text.append(sentence.text)
    # Single pipeline call over all sentence texts at once.
    stanza_doc = self.nlp(sentences_text)
    all_deps = []
    all_upos_tags = []
    for stanza_sentence, sentence in zip(stanza_doc.sentences, sentences):
        # Maps stanza word id -> the GenericLabel already created for that
        # word, so a dependent can point at its head's label.
        dependencies = {}
        stanza_dependencies = stanza_sentence.dependencies
        stanza_dependencies = list(stanza_dependencies)
        # Worklist loop: a (head, deprel, dep) triple can only be turned into
        # a label once its head's label exists, so unprocessable triples are
        # re-queued. MAX_ITER bounds the loop in case the graph is malformed
        # (e.g. a cycle) and no progress can be made.
        i = 0
        while len(stanza_dependencies) > 0:
            i += 1
            if i > MAX_ITER:
                raise ValueError(
                    'Maximum Iterations reached while processing dependency graph.')
            head, deprel, dep = stanza_dependencies.pop()
            head_id = int(head.id)
            if head_id == 0:
                # Word id 0 is the artificial root: this dep is the sentence
                # root and has no head label.
                head_dep_label = None
            else:
                try:
                    head_dep_label = dependencies[head_id]
                except KeyError:
                    # Head not labeled yet; push to the front and retry later.
                    stanza_dependencies.insert(0, (head, deprel, dep))
                    continue
            # Convert stanza character offsets (document-relative within the
            # stanza doc, presumably — TODO confirm) into event-document
            # offsets by rebasing on the sentence start.
            token_begin = sentence.start_index + dep.parent.start_char - stanza_sentence.tokens[
                0].start_char
            token_end = sentence.start_index + dep.parent.end_char - stanza_sentence.tokens[
                0].start_char
            dep_label = GenericLabel(token_begin, token_end, head=head_dep_label,
                                     deprel=deprel)
            # Pre-seed the reverse links; children append themselves below.
            dep_label.reference_cache['dependents'] = []
            dependencies[int(dep.id)] = dep_label
            if head_dep_label is not None:
                head_dep_label.dependents.append(dep_label)
            all_deps.append(dep_label)
        # Universal POS tags, one label per word, rebased the same way.
        for word in stanza_sentence.words:
            token = word.parent
            token_begin = sentence.start_index + token.start_char - stanza_sentence.tokens[
                0].start_char
            token_end = sentence.start_index + token.end_char - stanza_sentence.tokens[
                0].start_char
            all_upos_tags.append(GenericLabel(token_begin, token_end, tag=word.upos))
    document.add_labels('dependencies', all_deps)
    document.add_labels('upos_tags', all_upos_tags)
def process_document(self, document: Document, params: Dict[str, Any]):
    """Run stanza over every sentence of *document* and attach the resulting
    'dependencies' and 'upos_tags' label indices."""
    sentences = document.labels['sentences']

    # One pipeline invocation for all sentence texts.
    stanza_doc = self.nlp([sentence.text for sentence in sentences])

    dependencies = []
    upos_tags = []
    for stanza_sentence, sentence in zip(stanza_doc.sentences, sentences):
        sentence_deps, sentence_upos = stanza_deps_and_upos_tags(
            sentence, stanza_sentence)
        dependencies.extend(sentence_deps)
        upos_tags.extend(sentence_upos)

    document.add_labels('dependencies', dependencies)
    document.add_labels('upos_tags', upos_tags)
def test_yml_serializer():
    """Round-trips an event through the YAML serializer and checks that the
    deserialized event, document, and label indices compare equal."""
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)

    one = label(start_index=0, end_index=5, x=10)
    two = label(start_index=6, end_index=10, x=15)
    document.add_labels('one', [one, two])
    # 'two' holds labels that reference the labels in 'one'.
    document.add_labels('two', [
        label(start_index=0, end_index=25, a='b', b=one),
        label(start_index=26, end_index=42, a='c', b=two)
    ])
    document.add_labels('three', [
        label(start_index=0, end_index=10, foo=True),
        label(start_index=11, end_index=15, foo=False)
    ], distinct=True)

    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        round_tripped = YamlSerializer.file_to_event(tf)

    assert round_tripped.event_id == event.event_id
    assert round_tripped.metadata['foo'] == 'bar'
    doc = round_tripped.documents['plaintext']
    assert doc.text == document.text
    assert doc.labels['one'] == [one, two]
    assert doc.labels['two'] == [
        label(start_index=0, end_index=25, a='b', b=one),
        label(start_index=26, end_index=42, a='c', b=two)
    ]
    assert doc.labels['three'] == [
        label(start_index=0, end_index=10, foo=True),
        label(start_index=11, end_index=15, foo=False)
    ]
def test_add_labels_distinct(mocker):
    """add_labels(distinct=True) uploads via the client and stores a distinct
    local index."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    labels = [
        GenericLabel(start, end, document=document, x=i + 1)
        for i, (start, end) in enumerate([(0, 10), (11, 15), (16, 20)])
    ]
    document.add_labels('index', labels, distinct=True)
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=mocker.ANY)
    stored = document.labels['index']
    assert stored == labels
    assert stored.distinct
def test_add_labels_not_distinct(mocker):
    """add_labels without distinct uploads via the client and returns a
    non-distinct index."""
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    labels = [
        GenericLabel(start, end, document=document, x=i + 1)
        for i, (start, end) in enumerate([(0, 10), (11, 15), (16, 20)])
    ]
    created = document.add_labels('index', labels)
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=mocker.ANY)
    assert created == labels
    assert not created.distinct
def test_yml_serializer():
    """Serializes an event to YAML and inspects the raw mapping structure
    (event id, metadata, document text, and label index contents)."""
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    document.add_labels('one', [label(start_index=0, end_index=5, x=10),
                                label(start_index=6, end_index=10, x=15)])
    document.add_labels('two', [label(start_index=0, end_index=25, a='b'),
                                label(start_index=26, end_index=42, a='c')])
    document.add_labels('three', [label(start_index=0, end_index=10, foo=True),
                                  label(start_index=11, end_index=15, foo=False)],
                        distinct=True)

    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        dumped = yaml.load(tf, Loader=Loader)

    assert dumped['event_id'] == '1'
    assert dumped['metadata']['foo'] == 'bar'
    doc = dumped['documents']['plaintext']
    assert doc['text'] == 'Some text.'
    indices = doc['label_indices']
    assert len(indices) == 3
    assert indices['one'] == {
        'json_labels': [
            {'start_index': 0, 'end_index': 5, 'x': 10},
            {'start_index': 6, 'end_index': 10, 'x': 15},
        ],
        'distinct': False
    }
    assert indices['two'] == {
        'json_labels': [
            {'start_index': 0, 'end_index': 25, 'a': 'b'},
            {'start_index': 26, 'end_index': 42, 'a': 'c'},
        ],
        'distinct': False
    }
    assert indices['three'] == {
        'json_labels': [
            {'start_index': 0, 'end_index': 10, 'foo': True},
            {'start_index': 11, 'end_index': 15, 'foo': False},
        ],
        'distinct': True
    }