Ejemplo n.º 1
0
 def build_doc(self, event):
     """Create the 'plaintext' document on ``event`` and attach this object's labels.

     The document text is the items of ``self.text`` joined with single spaces.
     ``self.tags`` is added as the distinct 'pos_tags' index and
     ``self.sentences`` as the distinct 'sentences' index.

     Returns the newly created document.
     """
     document = Document(document_name='plaintext', text=' '.join(self.text))
     # The document is registered on the event before any labels are added to it.
     event.add_document(document)
     document.add_labels('pos_tags', self.tags, distinct=True)
     document.add_labels('sentences', self.sentences, distinct=True)
     return document
Ejemplo n.º 2
0
    def process_document(self, document: Document, params: Dict[str, Any]):
        """Parse candidate sentences and store their dependencies and UPOS tags.

        Only sentences that contain at least one label from the terms index
        (named by the ``terms_index`` parameter, default ``'umls_terms'``) and at
        least one label from the ``'negation_triggers'`` index are passed to
        ``self.nlp``. The per-sentence results are collected and added to the
        document as the 'dependencies' and 'upos_tags' label indices.
        """
        terms = document.labels[params.get('terms_index', 'umls_terms')]
        negation_triggers = document.labels['negation_triggers']

        # Keep a sentence only when both a term and a negation trigger lie inside it.
        candidates = [
            sentence for sentence in document.labels['sentences']
            if len(terms.inside(sentence)) != 0
            and len(negation_triggers.inside(sentence)) != 0
        ]
        stanza_doc = self.nlp([sentence.text for sentence in candidates])

        all_deps = []
        all_upos_tags = []
        # Pair each kept sentence with its parse, in order.
        for sentence, stanza_sentence in zip(candidates, stanza_doc.sentences):
            deps, upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
            all_deps.extend(deps)
            all_upos_tags.extend(upos_tags)

        document.add_labels('dependencies', all_deps)
        document.add_labels('upos_tags', all_upos_tags)
Ejemplo n.º 3
0
    def process_document(self, document: mtap.Document, params: Dict[str, Any]):
        """Demonstrate the ways a label can reference other labels.

        Four adjacent one-character labels are created and then referenced from
        three other indices: through a dict-valued field ('map_references'),
        through list-valued fields ('list_references'), and directly as fields
        ('references'). The referenced labels themselves are added last, under
        the 'referenced' index.
        """
        referenced = [mtap.GenericLabel(start, start + 1) for start in range(4)]
        first, second, third, fourth = referenced

        # A reference field may be a mapping from strings to labels.
        with document.get_labeler('map_references') as add_map_reference:
            add_map_reference(0, 4, ref={
                'a': first,
                'b': second,
                'c': third,
                'd': fourth,
            })

        # A reference field may be a list of labels.
        with document.get_labeler('list_references') as add_list_reference:
            add_list_reference(0, 2, ref=[first, second])
            add_list_reference(2, 3, ref=[third, fourth])

        # Labels may also be referenced directly as field values.
        with document.get_labeler('references') as add_reference:
            add_reference(0, 2, a=first, b=second)
            add_reference(2, 3, a=third, b=fourth)

        # Referenced labels do not have to be added (via "addLabels" or
        # "Labeler.close") before the label indices that reference them: the
        # Document delays uploading those indices to the server until they are.
        document.add_labels('referenced', referenced)
Ejemplo n.º 4
0
def copy_document(event: Event,
                  source_document_name: str,
                  target_document_name: str,
                  index_names: Sequence[str] = ...):
    """Copy a document, and its label indices, to a new document on the same event.

    A new document with the source document's text is created under the target
    name and added to the event; each selected label index is then added to it
    with the same distinct flag as the source index.

    Parameters
    ----------
    event: Event
        The event that holds the source document and receives the copy.
    source_document_name: str
        The name of the document to copy from.
    target_document_name: str
        The name to give the new document.
    index_names: Sequence[str]
        The names of the label indices to copy. When left at its default (``...``),
        every index reported by the source document is copied.
    """
    source = event.documents[source_document_name]
    target = Document(target_document_name, text=source.text)
    event.add_document(target)

    # ``...`` is the "not specified" sentinel: fall back to all indices on the source.
    if index_names is ...:
        index_names = [info.index_name for info in source.get_label_indices_info()]

    for name in index_names:
        label_index = source.get_label_index(name)
        target.add_labels(name, label_index, distinct=label_index.distinct)
Ejemplo n.º 5
0
    def process_document(self,
                         document: Document,
                         params: Dict[str, Any]):
        """Parse every sentence and store dependency and UPOS tag labels.

        The text of each label in the 'sentences' index is passed to ``self.nlp``.
        For every parsed sentence a ``GenericLabel`` is created per dependency,
        with a ``head`` field holding the label of its head (``None`` when the
        head id is 0) and a ``deprel`` field, and a ``GenericLabel`` with a
        ``tag`` field is created per word. The labels are added to the document
        as the 'dependencies' and 'upos_tags' indices.

        Raises
        ------
        ValueError
            If more than ``MAX_ITER`` work-list iterations are needed for a
            single sentence.
        """
        sentences = document.labels['sentences']
        stanza_doc = self.nlp([sentence.text for sentence in sentences])

        all_deps = []
        all_upos_tags = []
        for stanza_sentence, sentence in zip(stanza_doc.sentences, sentences):
            labels_by_word_id = {}
            # Work list of (head, deprel, dep) triples still waiting for a label.
            pending = list(stanza_sentence.dependencies)
            iterations = 0
            while pending:
                iterations += 1
                if iterations > MAX_ITER:
                    raise ValueError(
                        'Maximum Iterations reached while processing dependency graph.')
                head, deprel, dep = pending.pop()
                head_id = int(head.id)
                head_label = None
                if head_id != 0:
                    if head_id not in labels_by_word_id:
                        # The head has no label yet: move this triple to the front of
                        # the work list so it is retried after the remaining ones.
                        pending.insert(0, (head, deprel, dep))
                        continue
                    head_label = labels_by_word_id[head_id]

                # Shift token offsets so they are relative to the sentence label's
                # start index rather than to the parsed sentence's first token.
                dep_token = dep.parent
                first_char = stanza_sentence.tokens[0].start_char
                dep_begin = sentence.start_index + dep_token.start_char - first_char
                dep_end = sentence.start_index + dep_token.end_char - first_char
                dep_label = GenericLabel(dep_begin, dep_end, head=head_label, deprel=deprel)
                dep_label.reference_cache['dependents'] = []
                labels_by_word_id[int(dep.id)] = dep_label
                if head_label is not None:
                    head_label.dependents.append(dep_label)
                all_deps.append(dep_label)

            for word in stanza_sentence.words:
                word_token = word.parent
                first_char = stanza_sentence.tokens[0].start_char
                tag_begin = sentence.start_index + word_token.start_char - first_char
                tag_end = sentence.start_index + word_token.end_char - first_char
                all_upos_tags.append(GenericLabel(tag_begin, tag_end, tag=word.upos))

        document.add_labels('dependencies', all_deps)
        document.add_labels('upos_tags', all_upos_tags)
Ejemplo n.º 6
0
    def process_document(self, document: Document, params: Dict[str, Any]):
        """Parse every sentence and store its dependencies and UPOS tags.

        The text of each label in the 'sentences' index is passed to ``self.nlp``;
        the results of ``stanza_deps_and_upos_tags`` for each sentence are
        collected and added to the document as the 'dependencies' and
        'upos_tags' label indices.
        """
        sentences = document.labels['sentences']
        stanza_doc = self.nlp([sentence.text for sentence in sentences])

        all_deps = []
        all_upos_tags = []
        # Pair each parsed sentence with the sentence label it came from, in order.
        for stanza_sentence, sentence in zip(stanza_doc.sentences, sentences):
            deps, upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
            all_deps.extend(deps)
            all_upos_tags.extend(upos_tags)

        document.add_labels('dependencies', all_deps)
        document.add_labels('upos_tags', all_upos_tags)
Ejemplo n.º 7
0
def test_yml_serializer():
    """Round-trip an event through YamlSerializer and check it comes back unchanged.

    The event carries one metadata entry and a 'plaintext' document with three
    label indices: plain labels ('one'), labels that reference the labels of
    'one' through field ``b`` ('two'), and a distinct index ('three').
    """
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    one = label(start_index=0, end_index=5, x=10)
    two = label(start_index=6, end_index=10, x=15)

    def referencing_labels():
        # Builds new label objects on every call; each references ``one``/``two``.
        return [label(start_index=0, end_index=25, a='b', b=one),
                label(start_index=26, end_index=42, a='c', b=two)]

    def flag_labels():
        # Builds new label objects on every call.
        return [label(start_index=0, end_index=10, foo=True),
                label(start_index=11, end_index=15, foo=False)]

    document.add_labels('one', [one, two])
    document.add_labels('two', referencing_labels())
    document.add_labels('three', flag_labels(), distinct=True)

    # Serialize to a temporary file, rewind, and read the event back.
    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        e = YamlSerializer.file_to_event(tf)

    assert e.event_id == event.event_id
    assert e.metadata['foo'] == 'bar'
    d = e.documents['plaintext']
    assert d.text == document.text
    assert d.labels['one'] == [one, two]
    assert d.labels['two'] == referencing_labels()
    assert d.labels['three'] == flag_labels()
Ejemplo n.º 8
0
def test_add_labels_distinct(mocker):
    """Adding labels with ``distinct=True`` sends them to the client and yields a distinct index."""
    events_client = mocker.Mock(EventsClient)
    events_client.get_local_instance.return_value = events_client
    events_client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=events_client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    spans = [(0, 10), (11, 15), (16, 20)]
    labels = [
        GenericLabel(begin, end, document=document, x=x)
        for x, (begin, end) in enumerate(spans, start=1)
    ]

    document.add_labels('index', labels, distinct=True)

    events_client.add_labels.assert_called_with(event_id='1',
                                                document_name='plaintext',
                                                index_name='index',
                                                labels=labels,
                                                adapter=mocker.ANY)
    # Read the index back through the document rather than using a return value.
    stored = document.labels['index']
    assert stored == labels
    assert stored.distinct
Ejemplo n.º 9
0
def test_add_labels_not_distinct(mocker):
    """Adding labels without ``distinct`` sends them to the client and returns a non-distinct index."""
    events_client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=events_client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    spans = [(0, 10), (11, 15), (16, 20)]
    labels = [
        GenericLabel(begin, end, document=document, x=x)
        for x, (begin, end) in enumerate(spans, start=1)
    ]

    # Here the value returned by add_labels itself is what gets checked.
    returned = document.add_labels('index', labels)

    events_client.add_labels.assert_called_with(event_id='1',
                                                document_name='plaintext',
                                                index_name='index',
                                                labels=labels,
                                                adapter=mocker.ANY)
    assert returned == labels
    assert not returned.distinct
Ejemplo n.º 10
0
def test_yml_serializer():
    """Check the raw YAML structure that YamlSerializer writes for an event.

    The event is written to a temporary file and parsed back with ``yaml.load``
    so the assertions run against plain dicts and lists.
    """
    def expected_index(json_labels, distinct):
        # Shape each serialized label index is compared against.
        return {'json_labels': json_labels, 'distinct': distinct}

    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    document.add_labels('one', [label(start_index=0, end_index=5, x=10),
                                label(start_index=6, end_index=10, x=15)])
    document.add_labels('two', [label(start_index=0, end_index=25, a='b'),
                                label(start_index=26, end_index=42, a='c')])
    document.add_labels('three', [label(start_index=0, end_index=10, foo=True),
                                  label(start_index=11, end_index=15, foo=False)],
                        distinct=True)

    # Serialize, rewind, and parse the file the serializer just wrote.
    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        o = yaml.load(tf, Loader=Loader)

    assert o['event_id'] == '1'
    assert o['metadata']['foo'] == 'bar'
    d = o['documents']['plaintext']
    assert d['text'] == 'Some text.'
    indices = d['label_indices']
    assert len(indices) == 3
    assert indices['one'] == expected_index(
        [{'start_index': 0, 'end_index': 5, 'x': 10},
         {'start_index': 6, 'end_index': 10, 'x': 15}],
        False)
    assert indices['two'] == expected_index(
        [{'start_index': 0, 'end_index': 25, 'a': 'b'},
         {'start_index': 26, 'end_index': 42, 'a': 'c'}],
        False)
    assert indices['three'] == expected_index(
        [{'start_index': 0, 'end_index': 10, 'foo': True},
         {'start_index': 11, 'end_index': 15, 'foo': False}],
        True)