def document_to_dict(document: Document, *, include_label_text: bool = False) -> Dict:
    """Convert a document into a plain python dictionary.

    Label indices whose type is ``LabelIndexType.OTHER`` or
    ``LabelIndexType.UNKNOWN`` are skipped with a logged warning.

    Args:
        document (Document): The document object.

    Keyword Args:
        include_label_text (bool): Forwarded unchanged to
            ``label_index_to_dict`` for every label index that is converted.

    Returns:
        dict: A dictionary with the document text under ``'text'`` and the
        converted label indices, keyed by index name, under
        ``'label_indices'``.
    """
    # Read the text first, before any label index information is requested.
    text = document.text
    label_indices = {}
    for info in document.get_label_indices_info():
        if info.type in (LabelIndexType.OTHER, LabelIndexType.UNKNOWN):
            logger.warning(
                'Index {} of type {} will not be included in serialization.'.format(
                    info.index_name, info.type.name
                )
            )
            continue
        label_index = document.get_label_index(info.index_name)
        label_indices[info.index_name] = label_index_to_dict(
            label_index,
            include_label_text=include_label_text,
        )
    return {'text': text, 'label_indices': label_indices}
def process_document(self, document: Document, params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Update every configured metric with this document's label indices.

    The label indices named by ``self.tested`` and ``self.target`` are looked
    up on the document and, when the matching ``*_filter`` attribute is not
    None, narrowed with it before being handed to the metrics.

    Args:
        document: The document whose label indices are compared.
        params: Processing parameters; not read by this method.

    Returns:
        A dictionary mapping each metric's ``name`` to the value returned by
        its ``update`` call for this document.
    """
    tested_index = document.get_label_index(self.tested)
    if self.tested_filter is not None:
        tested_index = tested_index.filter(self.tested_filter)

    target_index = document.get_label_index(self.target)
    if self.target_filter is not None:
        target_index = target_index.filter(self.target_filter)

    results = {}
    for metric in self.metrics:
        updated = metric.update(document, tested_index, target_index)
        results[metric.name] = updated
    return results
def ann_to_labels(ann_file: Union[str, Path],
                  document: Document,
                  label_index_name_prefix: str,
                  encoding: Optional[str],
                  create_indices: Optional[Iterable[str]] = None):
    """Reads all of the annotations in a brat annotations file into a document.

    Only lines that have at least three tab-separated fields and whose first
    field starts with ``T`` are used; every other line is skipped. When an
    annotation lists several ``;``-separated offset pairs, a single label
    spanning from the smallest start to the largest end is created.

    Parameters
    ----------
    ann_file: str or Path
        A BRAT .ann file to load annotations from.
    document: Document
        The document to add labels to.
    label_index_name_prefix: str
        A prefix to prepend to the brat annotation names to form the label
        index names.
    encoding: optional str
        The encoding to use when reading the ann file.
    create_indices: optional iterable of str
        These indices will be created no matter what, even if empty. The
        names are used as given; the prefix is not applied to them.
    """
    labelers = {}
    if create_indices is not None:
        for index in create_indices:
            labelers[index] = document.get_labeler(index)

    ann_file = Path(ann_file)
    with ann_file.open('r', encoding=encoding) as f:
        # Iterate the file object directly so the annotations are streamed
        # line by line instead of being loaded into memory all at once.
        for line in f:
            splits = line.split('\t')
            if len(splits) < 3 or not splits[0].startswith('T'):
                continue
            # Second field looks like "<name> <start> <end>[;<start> <end>...]".
            name, bounds = splits[1].split(' ', maxsplit=1)
            name = label_index_name_prefix + name

            # Collapse all offset pairs into one covering span.
            min_start = float('Inf')
            max_end = 0
            for pair in bounds.split(';'):
                start_index, end_index = pair.split(' ')
                min_start = min(min_start, int(start_index))
                max_end = max(max_end, int(end_index))

            # One labeler per index name, created the first time it is seen.
            try:
                labeler = labelers[name]
            except KeyError:
                labeler = document.get_labeler(name)
                labelers[name] = labeler
            labeler(min_start, max_end)

    for labeler in labelers.values():
        labeler.done()
def test_copy_document():
    """CopyDocument should copy the 'some_index' labels from 'first' to 'second'."""
    words = [(0, 3, 'The'), (4, 9, 'quick'), (10, 15, 'brown')]

    event = Event()
    source = Document(document_name='first',
                      text='The quick brown fox jumped over the lazy dog.')
    event.add_document(source)
    with source.get_labeler('some_index') as add_label:
        for start, end, word in words:
            add_label(start, end, word=word)

    # Run the copy only after the labeler context has been exited.
    CopyDocument('first', 'second').process(event, {})

    copied = event.documents['second']
    assert copied is not None
    expected = [GenericLabel(start, end, word=word) for start, end, word in words]
    assert copied.get_label_index('some_index') == expected
def test_add_document_no_client():
    """A document added to an event created without a client is retrievable by name."""
    text = ("“You're no help,” he told the lime. "
            "This was unfair. It was only a lime; "
            "there was nothing special about it at all. "
            "It was doing the best it could.")
    plaintext = Document('plaintext', text=text)
    container = Event(event_id='1')
    container.add_document(plaintext)
    assert container.documents['plaintext'] == plaintext
def dict_to_document(document_name: str, d: Dict, *, event: Optional[Event] = None) -> Document:
    """Build a Document from its serialized dictionary form.

    The document is added to ``event`` (when one is given) before any labels
    are added to it.

    Args:
        document_name (str): The name identifier of the document on the event.
        d (dict): The dictionary representation of the document. Its
            ``'text'`` and ``'label_indices'`` entries are read.
        event (~typing.Optional[Event]): An event that the document should be
            added to.

    Returns:
        Document: The deserialized Document object.
    """
    result = Document(document_name=document_name, text=d['text'])
    if event is not None:
        event.add_document(result)

    for index_name, serialized_index in d['label_indices'].items():
        label_index = dict_to_label_index(d=serialized_index)
        result.add_labels(index_name, label_index, distinct=label_index.distinct)
    return result
def test_add_document(mocker):
    """Adding a document to a client-backed event calls the client's add_document once."""
    text = ("“You're no help,” he told the lime. "
            "This was unfair. It was only a lime; "
            "there was nothing special about it at all. "
            "It was doing the best it could.")
    events_client = mocker.Mock(EventsClient)
    container = Event(event_id='1', client=events_client)
    plaintext = Document('plaintext', text=text)

    container.add_document(plaintext)

    assert container.documents['plaintext'] == plaintext
    # The client receives the event id, the document name and the full text.
    events_client.add_document.assert_called_once_with('1', 'plaintext', text)
def process_document(self, document: Document, params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Label the document with counts of the letters 'a' and 'b'.

    When ``params['do_work']`` is truthy, two labels covering the whole
    document text are added to the ``mtap.examples.letter_counts`` index,
    one per letter, each carrying that letter's occurrence count.

    NOTE(review): the nesting was reconstructed from a flattened source line;
    the stopwatch is assumed to wrap only the text fetch and the return is
    assumed to sit outside the ``if`` - confirm against the original layout.

    Args:
        document: The document to count letters in and to label.
        params: Processing parameters; only ``'do_work'`` is read.

    Returns:
        The fixed dictionary ``{'answer': 42}``.
    """
    if params['do_work']:
        with self.started_stopwatch('fetch_time'):
            text = document.text

        # Count both letters before the labeler is opened.
        letter_counts = {letter: text.count(letter) for letter in ('a', 'b')}

        with document.get_labeler('mtap.examples.letter_counts') as label_letter_count:
            for letter, count in letter_counts.items():
                label_letter_count(start_index=0,
                                   end_index=len(document.text),
                                   letter=letter,
                                   count=count)

    return {'answer': 42}
def test_json_serializer():
    """JsonSerializer.event_to_file writes the event id, metadata, text and label indices."""
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)

    document.add_labels('one', [
        mtap.GenericLabel(start_index=0, end_index=5, x=10),
        mtap.GenericLabel(start_index=6, end_index=10, x=15),
    ])
    document.add_labels('two', [
        mtap.GenericLabel(start_index=0, end_index=25, a='b'),
        mtap.GenericLabel(start_index=26, end_index=42, a='c'),
    ])
    document.add_labels('three', [
        mtap.GenericLabel(start_index=0, end_index=10, foo=True),
        mtap.GenericLabel(start_index=11, end_index=15, foo=False),
    ], distinct=True)

    # Round-trip through a temporary text file and parse the JSON back.
    with TemporaryFile('w+') as tf:
        JsonSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        loaded = json.load(tf)

    assert loaded['event_id'] == '1'
    assert loaded['metadata']['foo'] == 'bar'

    serialized_doc = loaded['documents']['plaintext']
    assert serialized_doc['text'] == 'Some text.'

    expected_indices = {
        'one': {
            'json_labels': [
                {'start_index': 0, 'end_index': 5, 'x': 10},
                {'start_index': 6, 'end_index': 10, 'x': 15},
            ],
            'distinct': False,
        },
        'two': {
            'json_labels': [
                {'start_index': 0, 'end_index': 25, 'a': 'b'},
                {'start_index': 26, 'end_index': 42, 'a': 'c'},
            ],
            'distinct': False,
        },
        'three': {
            'json_labels': [
                {'start_index': 0, 'end_index': 10, 'foo': True},
                {'start_index': 11, 'end_index': 15, 'foo': False},
            ],
            'distinct': True,
        },
    }
    serialized_indices = serialized_doc['label_indices']
    assert len(serialized_indices) == 3
    for index_name, expected in expected_indices.items():
        assert serialized_indices[index_name] == expected