Beispiel #1
0
def document_to_dict(document: Document,
                     *,
                     include_label_text: bool = False) -> Dict:
    """Convert a document into a plain python dictionary.

    The result has a ``'text'`` entry holding the document text and a
    ``'label_indices'`` entry mapping each serialized index name to the
    output of ``label_index_to_dict`` for that index. Indices whose type is
    ``LabelIndexType.OTHER`` or ``LabelIndexType.UNKNOWN`` are left out and a
    warning is logged for each of them.

    Args:
        document (Document): The document object.

    Keyword Args:
        include_label_text (bool): Forwarded to ``label_index_to_dict`` as
            its ``include_label_text`` argument.

    Returns:
        dict: A dictionary object suitable for serialization.
    """
    serialized_indices = {}
    result = {'text': document.text, 'label_indices': serialized_indices}
    # Index types that are skipped (with a warning) rather than serialized.
    skipped_types = (LabelIndexType.OTHER, LabelIndexType.UNKNOWN)

    for info in document.get_label_indices_info():
        if info.type in skipped_types:
            message = 'Index {} of type {} will not be included in serialization.'.format(
                info.index_name, info.type.name)
            logger.warning(message)
            continue
        label_index = document.get_label_index(info.index_name)
        serialized_indices[info.index_name] = label_index_to_dict(
            label_index, include_label_text=include_label_text)
    return result
Beispiel #2
0
 def process_document(self, document: Document,
                      params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
     """Update every configured metric using one document's label indices.

     The tested and target label indices are looked up on the document by
     the names stored in ``self.tested`` and ``self.target``; each one is
     narrowed with its corresponding filter when that filter is not None.

     Args:
         document (Document): The document to read the label indices from.
         params (dict): Processing parameters; not used by this method.

     Returns:
         dict: Maps each metric's name to the value returned by its
         ``update`` call.
     """
     tested_index = document.get_label_index(self.tested)
     tested_filter = self.tested_filter
     if tested_filter is not None:
         tested_index = tested_index.filter(tested_filter)

     target_index = document.get_label_index(self.target)
     target_filter = self.target_filter
     if target_filter is not None:
         target_index = target_index.filter(target_filter)

     return {
         metric.name: metric.update(document, tested_index, target_index)
         for metric in self.metrics
     }
Beispiel #3
0
def ann_to_labels(ann_file: Union[str, Path],
                  document: Document,
                  label_index_name_prefix: str,
                  encoding: Optional[str],
                  create_indices: Optional[Iterable[str]] = None):
    """Reads the text-bound annotations of a brat annotations file into a document.

    Only lines that have at least three tab-separated fields and whose first
    field starts with ``T`` are used; every other line is skipped. An
    annotation made up of several ``;``-separated fragments is added as one
    label running from the smallest start offset to the largest end offset.
    Every labeler obtained from the document is finished with ``done()``
    after the whole file has been read.

    Parameters
    ----------
    ann_file: str or Path
        A BRAT .ann file to load annotations from.
    document: Document
        The document to add labels to.
    label_index_name_prefix: str
        A prefix prepended to the brat annotation type to form the name of
        the label index the annotation is added to.
    encoding: optional str
        The encoding to use when reading the ann file.
    create_indices: optional iterable of str
        Index names for which a labeler is always requested, so these
        indices are created no matter what, even if empty. The prefix is not
        applied to these names.

    Raises
    ------
    ValueError
        If the second field of a used line is not a type name followed by
        pairs of space-separated integer offsets.
    """
    labelers = {}
    if create_indices is not None:
        for index_name in create_indices:
            labelers[index_name] = document.get_labeler(index_name)

    with Path(ann_file).open('r', encoding=encoding) as ann:
        # Iterate the file handle directly so the file is streamed line by
        # line instead of being loaded into a list first.
        for line in ann:
            fields = line.split('\t')
            is_text_bound = len(fields) >= 3 and fields[0].startswith('T')
            if not is_text_bound:
                continue

            brat_type, offsets = fields[1].split(' ', maxsplit=1)
            index_name = label_index_name_prefix + brat_type

            starts = []
            ends = []
            for fragment in offsets.split(';'):
                fragment_start, fragment_end = fragment.split(' ')
                starts.append(int(fragment_start))
                ends.append(int(fragment_end))

            if index_name not in labelers:
                labelers[index_name] = document.get_labeler(index_name)
            # The end offset is never allowed to fall below 0.
            labelers[index_name](min(starts), max(0, *ends))

    for labeler in labelers.values():
        labeler.done()
Beispiel #4
0
def test_copy_document():
    """Processing with CopyDocument('first', 'second') yields a 'second'
    document whose 'some_index' equals the labels added to 'first'."""
    words = [(0, 3, 'The'), (4, 9, 'quick'), (10, 15, 'brown')]

    event = Event()
    source = Document(document_name='first',
                      text='The quick brown fox jumped over the lazy dog.')
    event.add_document(source)
    with source.get_labeler('some_index') as add_label:
        for start, end, word in words:
            add_label(start, end, word=word)

    CopyDocument('first', 'second').process(event, {})

    copied = event.documents['second']
    assert copied is not None
    expected = [GenericLabel(start, end, word=word) for start, end, word in words]
    assert copied.get_label_index('some_index') == expected
Beispiel #5
0
def test_add_document_no_client():
    """A document added to an event created without a client can be looked
    up in ``event.documents`` under its name."""
    text = ("“You're no help,” he told the lime. "
            "This was unfair. It was only a lime; "
            "there was nothing special about it at all. "
            "It was doing the best it could.")
    event = Event(event_id='1')
    added = Document('plaintext', text=text)

    event.add_document(added)

    assert event.documents['plaintext'] == added
Beispiel #6
0
def dict_to_document(document_name: str,
                     d: Dict,
                     *,
                     event: Optional[Event] = None) -> Document:
    """Build a Document from its serialized dictionary form.

    The document is created from ``d['text']``, added to ``event`` when one
    is given, and then receives one label index for every entry in
    ``d['label_indices']``.

    Args:
        document_name (str): The name identifier of the document on the event.
        d (dict): The dictionary representation of the document; must contain
            the keys ``'text'`` and ``'label_indices'``.
        event (~typing.Optional[Event]): An event that the document should be
            added to.

    Returns:
        Document: The deserialized Document object.

    """
    result = Document(document_name=document_name, text=d['text'])
    if event is not None:
        # The document is added to the event before its labels are added.
        event.add_document(result)

    serialized_indices = d['label_indices']
    for index_name, serialized_index in serialized_indices.items():
        label_index = dict_to_label_index(d=serialized_index)
        result.add_labels(index_name, label_index, distinct=label_index.distinct)

    return result
Beispiel #7
0
def test_add_document(mocker):
    """Adding a document to an event that has a client stores it on the event
    and calls ``client.add_document`` exactly once with the event id, the
    document name and the document text."""
    text = ("“You're no help,” he told the lime. "
            "This was unfair. It was only a lime; "
            "there was nothing special about it at all. "
            "It was doing the best it could.")
    events_client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=events_client)
    added = Document('plaintext', text=text)

    event.add_document(added)

    assert event.documents['plaintext'] == added
    events_client.add_document.assert_called_once_with('1', 'plaintext', text)
Beispiel #8
0
    def process_document(self, document: Document,
                         params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Label the document with its counts of the letters 'a' and 'b'.

        Nothing is labeled unless ``params['do_work']`` is truthy. When it
        is, the document text is read inside the ``'fetch_time'`` stopwatch
        context and two labels spanning the whole text are added to the
        ``'mtap.examples.letter_counts'`` index, one per letter.

        Args:
            document (Document): The document to count letters in.
            params (dict): Processing parameters; must contain ``'do_work'``.

        Returns:
            dict: Always ``{'answer': 42}``.
        """
        if not params['do_work']:
            return {'answer': 42}

        with self.started_stopwatch('fetch_time'):
            text = document.text

        # Both counts are computed before the labeler is opened.
        letter_counts = [(letter, text.count(letter)) for letter in ('a', 'b')]

        index_name = 'mtap.examples.letter_counts'
        with document.get_labeler(index_name) as label_letter_count:
            for letter, count in letter_counts:
                label_letter_count(start_index=0,
                                   end_index=len(document.text),
                                   letter=letter,
                                   count=count)

        return {'answer': 42}
Beispiel #9
0
def test_json_serializer():
    """Write an event with three label indices through
    ``JsonSerializer.event_to_file`` and check the JSON that is read back."""
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)

    labels_one = [
        mtap.GenericLabel(start_index=0, end_index=5, x=10),
        mtap.GenericLabel(start_index=6, end_index=10, x=15),
    ]
    document.add_labels('one', labels_one)
    labels_two = [
        mtap.GenericLabel(start_index=0, end_index=25, a='b'),
        mtap.GenericLabel(start_index=26, end_index=42, a='c'),
    ]
    document.add_labels('two', labels_two)
    labels_three = [
        mtap.GenericLabel(start_index=0, end_index=10, foo=True),
        mtap.GenericLabel(start_index=11, end_index=15, foo=False),
    ]
    document.add_labels('three', labels_three, distinct=True)

    # Round-trip through a temporary text file: write, rewind, parse.
    with TemporaryFile('w+') as stream:
        JsonSerializer.event_to_file(event, stream)
        stream.flush()
        stream.seek(0)
        loaded = json.load(stream)

    expected_indices = {
        'one': {
            'json_labels': [
                {'start_index': 0, 'end_index': 5, 'x': 10},
                {'start_index': 6, 'end_index': 10, 'x': 15},
            ],
            'distinct': False,
        },
        'two': {
            'json_labels': [
                {'start_index': 0, 'end_index': 25, 'a': 'b'},
                {'start_index': 26, 'end_index': 42, 'a': 'c'},
            ],
            'distinct': False,
        },
        'three': {
            'json_labels': [
                {'start_index': 0, 'end_index': 10, 'foo': True},
                {'start_index': 11, 'end_index': 15, 'foo': False},
            ],
            'distinct': True,
        },
    }

    assert loaded['event_id'] == '1'
    assert loaded['metadata']['foo'] == 'bar'
    serialized_document = loaded['documents']['plaintext']
    assert serialized_document['text'] == 'Some text.'
    serialized_indices = serialized_document['label_indices']
    assert len(serialized_indices) == 3
    for index_name in ('one', 'two', 'three'):
        assert serialized_indices[index_name] == expected_indices[index_name]