def test_crf():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    # Train.
    mod = CRF()
    mod.fit([annotated_document])

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)
    ann = mod.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))
def test_transform_annotated_documents_to_bio_format():
    # Test no annotations.
    content = b"The quick brown fox jumps over the lazy dog."
    annotated_document = AnnotatedDocument(content, annotations=None)
    expected = ([[
        'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog',
        '.'
    ]], [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])
    transformed = transform_annotated_documents_to_bio_format(
        [annotated_document])
    assert_equal(transformed, expected)

    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document = AnnotatedDocument(content, annotations=annotations)
    expected = ([[
        'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog',
        '.'
    ]], [[
        'O', 'O', 'B_COLOR', 'B_ANIMAL', 'O', 'O', 'O', 'O', 'B_ANIMAL', 'O'
    ]])
    transformed = transform_annotated_documents_to_bio_format(
        [annotated_document])
    assert_equal(transformed, expected)
def test_crf_multi_term():
    content = b"The dark brown fox jumps over the lazy dog magnificently."
    annotations = [
        Annotation("dark brown", "COLOR", (4, 13)),
        Annotation("fox", "ANIMAL", (15, 17)),
        Annotation("dog", "ANIMAL", (39, 41))]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    # Train.
    mod = CRF()
    mod.fit([annotated_document])

    # Predict. Works!
    content = b"The dark brown fox."
    document = Document(content)
    ann = mod.transform([document])

    # 2 annotations, dark brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "dark brown")
    assert_equal(ann[0].annotations[0].offset, (4, 13))
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].offset, (15, 17))
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
def test_comparison_operators():
    annotation_1 = Annotation("pizza", "FOOD", (1, 6))
    annotation_2 = Annotation("pizza", "FOOD", (1, 6))
    annotation_3 = Annotation("cupcake", "FOOD", (9, 16))
    annotation_4 = Annotation("milk", "FOOD", (7, 11))

    assert_true(annotation_3 > annotation_1)
    assert_true(annotation_4 < annotation_3)
    assert_true(annotation_3 >= annotation_1)
    assert_true(annotation_4 <= annotation_3)
    assert_true(annotation_1 == annotation_2)
def test_optimizer():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # Test it with CRF because it's the least time-consuming one to train.
    crf = CRF()
    hparams = {
        "c1": ExactListParam([0.1, 0.9]),
        "c2": ExactListParam([0.1, 0.9])
    }
    optimizer = Optimizer(crf, hparams, "COLOR", cv=3)
    best_estimator, f1score = optimizer.optimize_and_return_best([
        annotated_document_1, annotated_document_2, annotated_document_3
    ])

    assert_greater_equal(f1score, 0.5)
def test_kfold_cv():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # Test it with CRF because it's the least time-consuming one to train.
    crf = CRF()
    kfold = KFoldCV(crf, k=3)
    average_f1 = kfold.cross_validate(
        [annotated_document_1, annotated_document_2, annotated_document_3],
        {"max_iterations": 100})

    # The examples and k are selected in a way where this always happens.
    assert_equal(average_f1, 0.5)
def transform(self, X, y=None):
    """ Annotates the list of `Document` objects that are provided as
        input and returns a list of `AnnotatedDocument` objects.

        In a dictionary-based approach, a dictionary of keywords is used
        to create a FSA which is then used to search with. See [1].

        [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
    """
    annotated_documents = []
    for document in X:
        annotations = []
        doc_content_str = document.plain_text_
        for item in self.automaton.iter(doc_content_str):
            end_position, (index, word) = item
            start_position = end_position - len(word) + 1
            end_position = end_position + 1
            annotations.append(
                Annotation(word, self.entity_label,
                           (start_position, end_position)))
        annotated_documents.append(
            AnnotatedDocument(document.content,
                              annotations=annotations,
                              encoding=document.encoding))
    return annotated_documents
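# A minimal sketch (not part of the excerpt above) of how the `self.automaton`
# consumed by transform() could be built, assuming the `pyahocorasick` package
# (`import ahocorasick`). The (index, word) payload mirrors what the loop
# above unpacks from each match.
import ahocorasick


def build_keyword_automaton(keywords):
    automaton = ahocorasick.Automaton()
    for index, word in enumerate(keywords):
        # Store the keyword itself as the payload so that transform() can
        # recover the matched text and compute its character offsets.
        automaton.add_word(word, (index, word))
    # Convert the trie into a search automaton before calling iter().
    automaton.make_automaton()
    return automaton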
def test_transform_to_spacy_format():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document = AnnotatedDocument(content, annotations=annotations)
    expected = [("The quick brown fox jumps over the lazy dog.", {
        "entities": [(10, 15, "COLOR"), (16, 19, "ANIMAL"), (40, 43, "ANIMAL")]
    })]
    model = SpaCyStatisticalNER()
    transformed = model._transform_to_spacy_format([annotated_document])
    assert_equal(transformed, expected)
def test_ExactMatchMultiClassDictionaryNER2():
    documents = [
        AnnotatedDocument(
            b""" In this study , we have used the polymerase chain reaction ( PCR ) with nested primers to analyze X-inactivation patterns of the HUMARA loci in purified eosinophils from female patients with eosinophilia . """,
            annotations=[
                Annotation("HUMARA loci", "DNA", (139, 150)),
                Annotation("purified eosinophils", "cell-type", (154, 174))
            ])]
    ner = ExactMatchMultiClassDictionaryNER(
        "nerds/test/data/dictionary/biodictionary.txt")
    ner.fit(documents)
    pred_documents = ner.transform(documents)
    # Labels of the two annotations above, in document order.
    expected_labels = ["DNA", "cell-type"]
    for i, annotation in enumerate(pred_documents[0].annotations):
        pred_text = annotation.text
        pred_offsets = annotation.offset
        label_text = documents[0].plain_text_[pred_offsets[0]:pred_offsets[1]]
        assert_equal(
            pred_text, label_text,
            "predicted {:s} != label {:s}".format(pred_text, label_text))
        assert_equal(annotation.label, expected_labels[i])
def test_spacy():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    data = [annotated_document_1, annotated_document_2, annotated_document_3]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)
    ann = model.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
def test_spacy_multi_term():
    content = b"The quick dark brown fox jumps over the lazy dog."
    annotations = [
        Annotation("dark brown", "COLOR", (10, 19)),
        Annotation("fox", "ANIMAL", (21, 23)),
        Annotation("dog", "ANIMAL", (45, 47))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A dark brown fox jumps quickly."
    annotations = [
        Annotation("dark brown", "COLOR", (2, 11)),
        Annotation("fox", "ANIMAL", (13, 15))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was dark brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("dark brown", "COLOR", (17, 26)),
        Annotation("dog", "ANIMAL", (41, 43))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    data = [annotated_document_1, annotated_document_2, annotated_document_3]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict. Works!
    content = b"The dark brown fox."
    document = Document(content)
    ann = model.transform([document])

    # 2 annotations, dark brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "dark brown")
    assert_equal(ann[0].annotations[0].offset, (4, 13))
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].offset, (15, 17))
    assert_equal(ann[0].annotations[1].label, "ANIMAL")

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
def test_ner_ensemble_configuration():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # In that config file we have params for a CRF with c1 = 0.1 and c2 = 0.1.
    ner_ensemble_config = NERModelEnsembleConfiguration(
        "nerds/test/data/config/sample.yaml")
    ner_ensemble_config.fit([
        annotated_document_1, annotated_document_2, annotated_document_3])

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)
    ann = ner_ensemble_config.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))
def transform(self, X, y=None):
    """ Annotates the list of `Document` objects that are provided as
        input and returns a list of `AnnotatedDocument` objects.
    """
    annotated_documents = []
    for document in X:
        annotated_document = self.nlp(document.plain_text_)
        annotations = []
        for named_entity in annotated_document.ents:
            annotations.append(
                Annotation(
                    named_entity.text,
                    named_entity.label_,
                    (named_entity.start_char, named_entity.end_char - 1)))
        annotated_documents.append(
            AnnotatedDocument(document.content,
                              annotations=annotations,
                              encoding=document.encoding))
    return annotated_documents
def test_ensemble_pooling():
    ensemble = NERModelEnsemblePooling([NERModel(), NERModel(), NERModel()])

    x1 = Annotation("pizza", "FOOD", (1, 6))
    x2 = Annotation("milk", "FOOD", (7, 11))
    x3 = Annotation("cupcake", "FOOD", (12, 19))
    x4 = Annotation("kebab", "FOOD", (20, 25))
    x5 = Annotation("pie", "FOOD", (29, 32))
    x6 = Annotation("cheese", "FOOD", (35, 41))

    entity_matrix = [[x1, x2, x4], [x1, x3, x5], [x1, x2, x4, x6]]

    assert_equal(ensemble.vote(entity_matrix), [x1, x2, x3, x4, x5, x6])
def test_ensemble_majority_vote():
    ensemble = NERModelEnsembleMajorityVote(
        [NERModel(), NERModel(), NERModel()])

    x1 = Annotation("pizza", "FOOD", (1, 6))
    x2 = Annotation("milk", "FOOD", (7, 11))
    x3 = Annotation("cupcake", "FOOD", (12, 19))
    x4 = Annotation("kebab", "FOOD", (20, 25))
    x5 = Annotation("pie", "FOOD", (29, 32))
    x6 = Annotation("cheese", "FOOD", (35, 41))

    entity_matrix = [[x1, x2, x4], [x1, x3, x5], [x1, x2, x4, x6]]

    # Majority vote: 2 out of 3 classifiers voted for x1, x2 and x4.
    assert_equal(ensemble.vote(entity_matrix), [x1, x2, x4])
def transform(self, X, y=None):
    """ Annotates the list of `Document` objects that are provided as
        input and returns a list of `AnnotatedDocument` objects.
    """
    annotated_documents = []
    for document in X:
        content = sentence_to_tokens(document.plain_text_)
        output = self.model.analyze(content)
        substring_index = 0
        annotations = []
        for entity in output["entities"]:
            start_idx, end_idx = _get_offsets_with_fuzzy_matching(
                document.plain_text_, entity["text"], substring_index)
            offset = (start_idx, end_idx - 1)
            annotations.append(
                Annotation(document.plain_text_[start_idx:end_idx],
                           self._label_map[entity["type"]],
                           offset))
            substring_index = end_idx
        annotated_documents.append(
            AnnotatedDocument(document.content,
                              annotations=annotations,
                              encoding=document.encoding))
    return annotated_documents
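# `_get_offsets_with_fuzzy_matching` is defined elsewhere in the library and
# is not shown in this excerpt. Below is a simplified stand-in, not the real
# helper: exact substring search only, starting at `search_from` so that
# repeated entity mentions map to successive occurrences; the actual helper
# presumably tolerates small whitespace/tokenization differences ("fuzzy").
def _get_offsets_exact_matching(text, entity_text, search_from=0):
    # Returns (start, end) with an exclusive end index, matching how the
    # caller above derives the inclusive offset as (start_idx, end_idx - 1).
    start = text.find(entity_text, search_from)
    if start == -1:
        # Crude fallback: collapse runs of whitespace in the entity text
        # before retrying, which handles simple detokenization artifacts.
        entity_text = " ".join(entity_text.split())
        start = text.find(entity_text, search_from)
    if start == -1:
        raise ValueError("Entity text not found in document")
    return start, start + len(entity_text)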
def transform(self, X, y=None):
    """ Annotates the list of `Document` objects that are provided as
        input and returns a list of `AnnotatedDocument` objects.

        In a dictionary-based approach, a dictionary of keywords is used
        to create a FSA which is then used to search with. See [1].

        [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
    """
    annotated_documents = []
    for document in X:
        annotations = []
        doc_content_str = document.plain_text_
        for item in self.automaton.iter(doc_content_str):
            end_position, (label, word) = item
            start_position = end_position - len(word) + 1
            end_position = end_position + 1
            # Aho-Corasick matches partial strings in the input document,
            # which leads to spurious matches, so we check that the match
            # spans a full word before adding it to our list of valid
            # annotations.
            if ((start_position <= 0 and
                    doc_content_str[end_position] == " ") or
                    (end_position >= len(doc_content_str) and
                        doc_content_str[start_position - 1] == " ") or
                    (doc_content_str[start_position - 1] == " " and
                        doc_content_str[end_position] == " ")):
                annotations.append(
                    Annotation(word, label, (start_position, end_position)))
        annotated_documents.append(
            AnnotatedDocument(document.content,
                              annotations=annotations,
                              encoding=document.encoding))
    return annotated_documents
def _read_brat_ann_file(self, path_to_ann_file):
    """ Helper function to read brat annotations.

        TODO: Right now, it reads only ENTITIES from BRAT ann files, but
        we need to extend it to also read ENTITY RELATIONSHIPS.
    """
    annotations = set()
    if isfile(path_to_ann_file):
        with open(path_to_ann_file, 'rb') as ann_file:
            for ann_line in ann_file:
                ann_line = ann_line.decode(self.encoding)
                # Comments start with a hash.
                if ann_line.startswith("#"):
                    continue
                split_ann_line = ann_line.strip().split("\t")
                # Entity annotations have at least 3 tab-separated fields,
                # e.g.: "TEmma2\tGrant 475 491\tGIA G-14-0006063"
                # The annotations can also be relations.
                # TODO: Add code to read relations.
                if len(split_ann_line) > 2:
                    entity_str = split_ann_line[2]
                    # Looks like "Grant 475 491"
                    entity_type_offsets = split_ann_line[1].split(" ")
                    entity_name = entity_type_offsets[0]
                    start_offset = int(entity_type_offsets[1])
                    end_offset = int(entity_type_offsets[2]) - 1
                    annotations.add(Annotation(
                        entity_str, entity_name,
                        (start_offset, end_offset)))
    return sorted(list(annotations))
def test_ensemble_weighted_vote():
    ensemble = NERModelEnsembleWeightedVote(
        [NERModel(), NERModel(), NERModel()])
    ensemble.confidence_scores = [0.4, 0.7, 0.3]

    x1 = Annotation("pizza", "FOOD", (1, 6))
    x2 = Annotation("milk", "FOOD", (7, 11))
    x3 = Annotation("cupcake", "FOOD", (12, 19))
    x4 = Annotation("kebab", "FOOD", (20, 25))
    x5 = Annotation("pie", "FOOD", (29, 32))
    x6 = Annotation("cheese", "FOOD", (35, 41))

    entity_matrix = [[x1, x2, x4], [x1, x3, x5], [x1, x2, x4, x6]]

    # Unlike the majority vote, here we expect to see x3 and x5 in the
    # annotations, because they come from a classifier of significantly
    # higher confidence.
    assert_equal(ensemble.vote(entity_matrix), [x1, x2, x3, x4, x5])
def split_annotated_document(
        annotated_document, splitter=document_to_sentences):
    """ Splits an annotated document and maintains the annotation offsets.

        This function accepts an AnnotatedDocument object as a parameter
        along with an optional tokenization method. It splits the document
        according to the tokenization method, and returns a list of
        AnnotatedDocument objects, where the annotation offsets have been
        adjusted.

        Args:
            annotated_document (AnnotatedDocument): The document that will
                be split into more documents.
            splitter (func, optional): A function that accepts a string as
                input and returns a list of strings. Defaults to
                `document_to_sentences`, which is the default sentence
                splitter for this library.

        Returns:
            list(AnnotatedDocument): A list of annotated documents.
    """
    snippets = [
        snippet.strip()
        for snippet in splitter(annotated_document.plain_text_)]
    annotations = annotated_document.annotations

    cur_snippet_idx = 0
    cur_ann_idx = 0
    result_ann = []

    # Iterate over every snippet of text and isolate its annotations,
    # then construct a single AnnotatedDocument object from them.
    while cur_snippet_idx < len(snippets) and cur_ann_idx < len(annotations):
        cur_substring_idx = 0
        token_ann = []

        cur_snippet = snippets[cur_snippet_idx]
        cur_annotation_text = annotations[cur_ann_idx].text
        cur_annotation_label = annotations[cur_ann_idx].label

        idx_found = cur_snippet.find(cur_annotation_text, cur_substring_idx)

        # Iterate the annotations for as long as we keep finding them in
        # the current snippet of text.
        while idx_found != -1:
            cur_annotation_offsets = (
                idx_found, idx_found + len(cur_annotation_text) - 1)
            token_ann.append(Annotation(
                cur_annotation_text,
                cur_annotation_label,
                cur_annotation_offsets))
            cur_substring_idx = idx_found + len(cur_annotation_text)

            cur_ann_idx += 1
            if cur_ann_idx < len(annotations):
                cur_annotation_text = annotations[cur_ann_idx].text
                cur_annotation_label = annotations[cur_ann_idx].label
                idx_found = cur_snippet.find(
                    cur_annotation_text, cur_substring_idx)
            else:
                break

        result_ann.append(AnnotatedDocument(
            cur_snippet.encode(annotated_document.encoding), token_ann))
        cur_snippet_idx += 1

    return result_ann
def transform_bio_tags_to_annotated_document(tokens, bio_tags, document):
    """ Given a list of tokens, a list of BIO tags, and a document object,
        this function returns an annotated document formed from this
        information.

        Example:
            doc    -> "Barack Obama lives in the White House"
            tokens -> ['Barack', 'Obama', 'lives', 'in', 'the', 'White',
                       'House']
            bio    -> ['B_Person', 'I_Person', 'O', 'O', 'O',
                       'B_Institution', 'I_Institution']

        It returns:
            AnnotatedDocument(
                content = "Barack Obama lives in the White House"
                annotations = (
                    (Barack Obama, Person, (0, 11))
                    (White House, Institution, (26, 36))
                )
            )
    """
    content = document.plain_text_
    cur_token_idx = 0
    cur_substring_idx = 0
    annotations = []
    while cur_token_idx < len(bio_tags):
        cur_token = tokens[cur_token_idx]
        cur_tag = bio_tags[cur_token_idx]

        if not cur_tag.startswith("B"):
            cur_substring_idx += len(cur_token)
            cur_token_idx += 1
            continue

        cur_label = cur_tag.split("_")[1]

        # Get the absolute start of the entity, given the index which
        # stores information about the previously detected entity offset.
        start_idx = content.find(cur_token, cur_substring_idx)
        end_idx = start_idx + len(cur_token)

        # If this is the last token, there is no continuation to check.
        if cur_token_idx + 1 < len(bio_tags):
            next_tag = bio_tags[cur_token_idx + 1]
            if next_tag.startswith("I"):
                while next_tag.startswith("I"):
                    cur_token_idx += 1
                    cur_token = tokens[cur_token_idx]
                    try:
                        next_tag = bio_tags[cur_token_idx + 1]
                    except IndexError:
                        break
                tmp_idx = content.find(cur_token, cur_substring_idx)
                # This line overwrites end_idx, in case there is a
                # multi-term annotation.
                end_idx = tmp_idx + len(cur_token)

        # Ends at the last character, and not after!
        idx_tuple = (start_idx, end_idx - 1)
        cur_substring_idx = end_idx

        annotations.append(Annotation(
            content[start_idx:end_idx], cur_label, idx_tuple))

        cur_token_idx += 1

    return AnnotatedDocument(
        document.content,
        annotations=annotations,
        encoding=document.encoding)
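# A quick usage sketch of the conversion above, mirroring the docstring
# example (it assumes, as in the tests elsewhere in this suite, that
# `Document` accepts bytes content):
def _example_transform_bio_tags():
    doc = Document(b"Barack Obama lives in the White House")
    tokens = ["Barack", "Obama", "lives", "in", "the", "White", "House"]
    tags = ["B_Person", "I_Person", "O", "O", "O",
            "B_Institution", "I_Institution"]
    annotated = transform_bio_tags_to_annotated_document(tokens, tags, doc)
    # annotated.annotations ->
    #   [Annotation("Barack Obama", "Person", (0, 11)),
    #    Annotation("White House", "Institution", (26, 36))]
    return annotated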
def test_to_inline_string():
    annotation = Annotation("pizza", "FOOD", (1, 6))
    assert_equal(annotation.to_inline_string(), "FOOD[pizza]")
def test_str():
    annotation = Annotation("pizza", "FOOD", (1, 6))
    assert_equal(str(annotation), "1,6 FOOD[pizza]")
def test_hash():
    annotation_1 = Annotation("pizza", "FOOD", (1, 6))
    annotation_2 = Annotation("pizza", "FOOD", (1, 6))
    assert_equal(hash(annotation_1), hash(annotation_2))
def test_split_annotated_document():
    content = (b"The quick brown fox jumps over the lazy dog. "
               b"Grumpy wizards make a toxic brew for the jovial queen.")
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42)),
        Annotation("wizards", "PERSON", (52, 58)),
        Annotation("brew", "DRINK", (73, 76)),
        Annotation("queen", "PERSON", (93, 97))
    ]
    annotated_document = AnnotatedDocument(content, annotations)

    result = split_annotated_document(annotated_document)

    assert_equal(result[0].content,
                 b"The quick brown fox jumps over the lazy dog.")
    assert_equal(result[1].content,
                 b"Grumpy wizards make a toxic brew for the jovial queen.")

    expected_annotations_doc1 = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    assert_equal(result[0].annotations, expected_annotations_doc1)

    expected_annotations_doc2 = [
        Annotation("wizards", "PERSON", (7, 13)),
        Annotation("brew", "DRINK", (28, 31)),
        Annotation("queen", "PERSON", (48, 52))
    ]
    assert_equal(result[1].annotations, expected_annotations_doc2)