def test_optimizer():
    """ Hyperparameter search over a CRF should reach F1 >= 0.5 on COLOR. """
    doc_specs = [
        (b"The quick brown fox jumps over the lazy dog.",
         [("brown", "COLOR", (10, 14)),
          ("fox", "ANIMAL", (16, 18)),
          ("dog", "ANIMAL", (40, 42))]),
        (b"A brown fox jumps quickly.",
         [("brown", "COLOR", (2, 6)),
          ("fox", "ANIMAL", (8, 10))]),
        (b"The fox that was brown jumps over a dog that was lazy.",
         [("fox", "ANIMAL", (4, 6)),
          ("brown", "COLOR", (17, 21)),
          ("dog", "ANIMAL", (36, 38))]),
    ]
    corpus = [
        AnnotatedDocument(
            text, annotations=[Annotation(*spec) for spec in specs])
        for text, specs in doc_specs
    ]

    # Test it with CRF because it's the least time-consuming one to train.
    hparams = {
        "c1": ExactListParam([0.1, 0.9]),
        "c2": ExactListParam([0.1, 0.9])
    }
    optimizer = Optimizer(CRF(), hparams, "COLOR", cv=3)
    best_estimator, f1score = optimizer.optimize_and_return_best(corpus)
    assert_greater_equal(f1score, 0.5)
def test_transform_annotated_documents_to_bio_format():
    """ BIO conversion: no annotations -> all O tags; else B_<LABEL> tags. """
    content = b"The quick brown fox jumps over the lazy dog."
    words = ['The', 'quick', 'brown', 'fox', 'jumps',
             'over', 'the', 'lazy', 'dog', '.']

    # Test no annotations: every token must be tagged O.
    doc_without = AnnotatedDocument(content, annotations=None)
    transformed = transform_annotated_documents_to_bio_format([doc_without])
    assert_equal(transformed, ([words], [['O'] * 10]))

    # With annotations, the entity tokens receive B_<LABEL> tags.
    doc_with = AnnotatedDocument(content, annotations=[
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42)),
    ])
    expected_tags = ['O', 'O', 'B_COLOR', 'B_ANIMAL', 'O',
                     'O', 'O', 'O', 'B_ANIMAL', 'O']
    transformed = transform_annotated_documents_to_bio_format([doc_with])
    assert_equal(transformed, ([words], [expected_tags]))
def test_kfold_cv():
    """ 3-fold CV of a CRF over three tiny documents averages F1 = 0.5. """
    doc_specs = [
        (b"The quick brown fox jumps over the lazy dog.",
         [("brown", "COLOR", (10, 14)),
          ("fox", "ANIMAL", (16, 18)),
          ("dog", "ANIMAL", (40, 42))]),
        (b"A brown fox jumps quickly.",
         [("brown", "COLOR", (2, 6)),
          ("fox", "ANIMAL", (8, 10))]),
        (b"The fox that was brown jumps over a dog that was lazy.",
         [("fox", "ANIMAL", (4, 6)),
          ("brown", "COLOR", (17, 21)),
          ("dog", "ANIMAL", (36, 38))]),
    ]
    corpus = [
        AnnotatedDocument(
            text, annotations=[Annotation(*spec) for spec in specs])
        for text, specs in doc_specs
    ]

    # Test it with CRF because it's the least time-consuming one to train.
    kfold = KFoldCV(CRF(), k=3)
    average_f1 = kfold.cross_validate(corpus, {"max_iterations": 100})
    # The examples and k are selected in a way where this always happens.
    assert_equal(average_f1, 0.5)
def test_split_annotated_document():
    """ Splitting a two-sentence document must rebase annotation offsets. """
    sentence_1 = b"The quick brown fox jumps over the lazy dog."
    sentence_2 = b"Grumpy wizards make a toxic brew for the jovial queen."
    annotated_document = AnnotatedDocument(
        sentence_1 + b" " + sentence_2,
        [
            Annotation("brown", "COLOR", (10, 14)),
            Annotation("fox", "ANIMAL", (16, 18)),
            Annotation("dog", "ANIMAL", (40, 42)),
            Annotation("wizards", "PERSON", (52, 58)),
            Annotation("brew", "DRINK", (73, 76)),
            Annotation("queen", "PERSON", (93, 97)),
        ])

    result = split_annotated_document(annotated_document)

    # Each sentence becomes its own document...
    assert_equal(result[0].content, sentence_1)
    assert_equal(result[1].content, sentence_2)
    # ...whose annotation offsets are relative to that sentence.
    assert_equal(result[0].annotations, [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42)),
    ])
    assert_equal(result[1].annotations, [
        Annotation("wizards", "PERSON", (7, 13)),
        Annotation("brew", "DRINK", (28, 31)),
        Annotation("queen", "PERSON", (48, 52)),
    ])
def test_crf_multi_term():
    """ CRF should learn and predict a multi-token entity ("dark brown").

    Bug fix: the "dog" training annotation carried offsets (40, 42), which
    were copied from the "quick brown" sentence; in this sentence ("The
    dark brown fox ...") the token "dog" occupies inclusive offsets
    (39, 41) — content[39:42] == b"dog" — so the offsets are corrected to
    keep the training data internally consistent.
    """
    content = b"The dark brown fox jumps over the lazy dog magnificently."
    annotations = [
        Annotation("dark brown", "COLOR", (4, 13)),
        Annotation("fox", "ANIMAL", (15, 17)),
        Annotation("dog", "ANIMAL", (39, 41))]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    # Train.
    mod = CRF()
    mod.fit([annotated_document])

    # Predict on unseen text containing both known entities.
    content = b"The dark brown fox."
    document = Document(content)
    ann = mod.transform([document])

    # 2 annotations, dark brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "dark brown")
    assert_equal(ann[0].annotations[0].offset, (4, 13))
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].offset, (15, 17))
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
def test_crf():
    """ CRF trained on one sentence should tag known entities in new text. """
    train_doc = AnnotatedDocument(
        b"The quick brown fox jumps over the lazy dog.",
        annotations=[
            Annotation("brown", "COLOR", (10, 14)),
            Annotation("fox", "ANIMAL", (16, 18)),
            Annotation("dog", "ANIMAL", (40, 42)),
        ])

    # Train.
    model = CRF()
    model.fit([train_doc])

    # Predict on unseen text: expect brown and fox back.
    ann = model.transform([Document(b"The quick brown fox.")])
    predicted = ann[0].annotations
    assert_equal(len(predicted), 2)
    expected = [("brown", "COLOR", (10, 14)),
                ("fox", "ANIMAL", (16, 18))]
    for annotation, (text, label, offset) in zip(predicted, expected):
        assert_equal(annotation.text, text)
        assert_equal(annotation.label, label)
        assert_equal(annotation.offset, offset)
def transform(self, X, y=None):
    """
    Annotates the list of `Document` objects that are provided as
    input and returns a list of `AnnotatedDocument` objects.

    In a dictionary based approach, a dictionary of keywords is used
    to create a FSA which is then used to search with. See [1].
    [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
    """
    result = []
    for document in X:
        text = document.plain_text_
        annotations = []
        for match_end, (index, word) in self.automaton.iter(text):
            # The automaton reports the index of the match's last
            # character; derive the (start, end) pair from it.
            match_start = match_end - len(word) + 1
            annotations.append(
                Annotation(word, self.entity_label,
                           (match_start, match_end + 1)))
        result.append(
            AnnotatedDocument(document.content,
                              annotations=annotations,
                              encoding=document.encoding))
    return result
def transform(self, X=None, y=None):
    """
    Transforms the available documents into the appropriate objects,
    differentiating on the `annotated` parameter.
    """
    # Without annotations there is nothing brat-specific to do:
    # delegate plain document reading to the base class.
    if not self.annotated:
        return super().transform(X, y)

    # Standard brat folder structure: every .txt document is paired
    # with a .ann file holding its annotations.
    annotated_docs = []
    for entry in os.listdir(self.path):
        txt_path = join(self.path, entry)
        if not (txt_path.endswith(".txt") and isfile(txt_path)):
            continue
        annotations = self._read_brat_ann_file(
            txt_path.replace(".txt", ".ann"))
        with open(txt_path, 'rb') as doc_file:
            annotated_docs.append(AnnotatedDocument(
                doc_file.read(), annotations, self.encoding))
    return annotated_docs
def test_spacy_multi_term():
    """ The spaCy NER should learn a multi-token entity ("dark brown"). """
    doc_specs = [
        (b"The quick dark brown fox jumps over the lazy dog.",
         [("dark brown", "COLOR", (10, 19)),
          ("fox", "ANIMAL", (21, 23)),
          ("dog", "ANIMAL", (45, 47))]),
        (b"A dark brown fox jumps quickly.",
         [("dark brown", "COLOR", (2, 11)),
          ("fox", "ANIMAL", (13, 15))]),
        (b"The fox that was dark brown jumps over a dog that was lazy.",
         [("fox", "ANIMAL", (4, 6)),
          ("dark brown", "COLOR", (17, 26)),
          ("dog", "ANIMAL", (41, 43))]),
    ]
    data = [
        AnnotatedDocument(
            text, annotations=[Annotation(*spec) for spec in specs])
        for text, specs in doc_specs
    ]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict on unseen text: expect dark brown and fox back.
    document = Document(b"The dark brown fox.")
    ann = model.transform([document])
    predicted = ann[0].annotations
    assert_equal(len(predicted), 2)
    expected = [("dark brown", "COLOR", (4, 13)),
                ("fox", "ANIMAL", (15, 17))]
    for annotation, (text, label, offset) in zip(predicted, expected):
        assert_equal(annotation.text, text)
        assert_equal(annotation.offset, offset)
        assert_equal(annotation.label, label)

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
def test_spacy():
    """ The spaCy NER should learn and tag single-token entities. """
    doc_specs = [
        (b"The quick brown fox jumps over the lazy dog.",
         [("brown", "COLOR", (10, 14)),
          ("fox", "ANIMAL", (16, 18)),
          ("dog", "ANIMAL", (40, 42))]),
        (b"A brown fox jumps quickly.",
         [("brown", "COLOR", (2, 6)),
          ("fox", "ANIMAL", (8, 10))]),
        (b"The fox that was brown jumps over a dog that was lazy.",
         [("fox", "ANIMAL", (4, 6)),
          ("brown", "COLOR", (17, 21)),
          ("dog", "ANIMAL", (36, 38))]),
    ]
    data = [
        AnnotatedDocument(
            text, annotations=[Annotation(*spec) for spec in specs])
        for text, specs in doc_specs
    ]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict on unseen text: expect brown and fox back.
    document = Document(b"The quick brown fox.")
    ann = model.transform([document])
    predicted = ann[0].annotations
    assert_equal(len(predicted), 2)
    expected = [("brown", "COLOR", (10, 14)),
                ("fox", "ANIMAL", (16, 18))]
    for annotation, (text, label, offset) in zip(predicted, expected):
        assert_equal(annotation.text, text)
        assert_equal(annotation.label, label)
        assert_equal(annotation.offset, offset)

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
def test_ner_ensemble_configuration():
    """ A YAML-configured ensemble should train and annotate new text. """
    doc_specs = [
        (b"The quick brown fox jumps over the lazy dog.",
         [("brown", "COLOR", (10, 14)),
          ("fox", "ANIMAL", (16, 18)),
          ("dog", "ANIMAL", (40, 42))]),
        (b"A brown fox jumps quickly.",
         [("brown", "COLOR", (2, 6)),
          ("fox", "ANIMAL", (8, 10))]),
        (b"The fox that was brown jumps over a dog that was lazy.",
         [("fox", "ANIMAL", (4, 6)),
          ("brown", "COLOR", (17, 21)),
          ("dog", "ANIMAL", (36, 38))]),
    ]
    training_data = [
        AnnotatedDocument(
            text, annotations=[Annotation(*spec) for spec in specs])
        for text, specs in doc_specs
    ]

    # In that config file we have params for a CRF with c1 = 0.1 and c2 = 0.1.
    ner_ensemble_config = NERModelEnsembleConfiguration(
        "nerds/test/data/config/sample.yaml")
    ner_ensemble_config.fit(training_data)

    # Predict on unseen text: expect brown and fox back.
    ann = ner_ensemble_config.transform([Document(b"The quick brown fox.")])
    predicted = ann[0].annotations
    assert_equal(len(predicted), 2)
    expected = [("brown", "COLOR", (10, 14)),
                ("fox", "ANIMAL", (16, 18))]
    for annotation, (text, label, offset) in zip(predicted, expected):
        assert_equal(annotation.text, text)
        assert_equal(annotation.label, label)
        assert_equal(annotation.offset, offset)
def transform(self, X, y=None):
    """
    Annotates the list of `Document` objects that are provided as
    input and returns a list of `AnnotatedDocument` objects.

    The basic implementation of this method does not annotate any
    entities and should be overridden by offspring.
    """
    # No entity recognition here: wrap each document unchanged, with
    # an empty annotation set.
    return [
        AnnotatedDocument(document.content, encoding=document.encoding)
        for document in X
    ]
def test_transform_to_spacy_format():
    """ Inclusive-end annotations become spaCy's exclusive-end tuples. """
    content = b"The quick brown fox jumps over the lazy dog."
    annotated_document = AnnotatedDocument(content, annotations=[
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42)),
    ])
    # spaCy expects (start, end_exclusive, label) per entity.
    expected = [("The quick brown fox jumps over the lazy dog.", {
        "entities": [(10, 15, "COLOR"), (16, 19, "ANIMAL"),
                     (40, 43, "ANIMAL")]
    })]
    model = SpaCyStatisticalNER()
    transformed = model._transform_to_spacy_format([annotated_document])
    assert_equal(transformed, expected)
def transform(self, X, y=None):
    """
    Annotates the list of `Document` objects that are provided as
    input and returns a list of `AnnotatedDocument` objects.

    Needs an implementation of the `vote` method.
    """
    # Gather every model's extractions: rows are models, columns are
    # documents.
    extractions = [model.extract(X, y) for model in self.models]

    annotated_documents = []
    for doc_idx, document in enumerate(X):
        # One column = each model's entities for this document; the
        # vote method reconciles them into a single annotation list.
        entity_matrix = np.array(extractions)[:, doc_idx].tolist()
        annotated_documents.append(
            AnnotatedDocument(document.content,
                              self.vote(entity_matrix),
                              document.encoding))
    return annotated_documents
def transform(self, X, y=None):
    """
    Annotates the list of `Document` objects that are provided as
    input and returns a list of `AnnotatedDocument` objects.
    """
    results = []
    for document in X:
        spacy_doc = self.nlp(document.plain_text_)
        # spaCy spans are end-exclusive; this library stores the index
        # of the last character, hence the -1.
        annotations = [
            Annotation(ent.text, ent.label_,
                       (ent.start_char, ent.end_char - 1))
            for ent in spacy_doc.ents
        ]
        results.append(
            AnnotatedDocument(document.content,
                              annotations=annotations,
                              encoding=document.encoding))
    return results
def test_ExactMatchMultiClassDictionaryNER2():
    """ Dictionary NER predictions must match the document text at the
    predicted offsets and carry the expected entity labels.

    Bug fix: ``expected_labels`` was referenced in the final assertion
    but never defined, so this test died with a NameError before any
    assertion ran; it is now derived from the ground-truth annotations.
    The content bytes are spelled out explicitly so the annotation
    offsets verifiably line up with them.
    """
    # 10-char prefix (newline + 9 spaces) followed by the sentence, so
    # that content[139:150] == b"HUMARA loci" and
    # content[154:174] == b"purified eosinophils".
    content = (
        b"\n         In this study , we have used the polymerase chain"
        b" reaction ( PCR ) with nested primers to analyze"
        b" X-inactivation patterns of the HUMARA loci in purified"
        b" eosinophils from female patients with eosinophilia ."
    )
    annotations = [
        Annotation("HUMARA loci", "DNA", (139, 150)),
        Annotation("purified eosinophils", "cell-type", (154, 174)),
    ]
    documents = [AnnotatedDocument(content, annotations=annotations)]
    # Predictions are expected to line up one-to-one with ground truth.
    expected_labels = [annotation.label for annotation in annotations]

    ner = ExactMatchMultiClassDictionaryNER(
        "nerds/test/data/dictionary/biodictionary.txt")
    ner.fit(documents)
    pred_documents = ner.transform(documents)
    for i, annotation in enumerate(pred_documents[0].annotations):
        pred_text = annotation.text
        pred_offsets = annotation.offset
        label_text = documents[0].plain_text_[
            pred_offsets[0]:pred_offsets[1]]
        assert_equal(pred_text, label_text,
                     "predicted {:s} != label {:s}".format(
                         pred_text, label_text))
        assert_equal(annotation.label, expected_labels[i])
def transform(self, X, y=None):
    """
    Annotates the list of `Document` objects that are provided as
    input and returns a list of `AnnotatedDocument` objects.
    """
    annotated_documents = []
    for document in X:
        text = document.plain_text_
        output = self.model.analyze(sentence_to_tokens(text))
        annotations = []
        search_from = 0
        for entity in output["entities"]:
            start_idx, end_idx = _get_offsets_with_fuzzy_matching(
                text, entity["text"], search_from)
            # Offsets are stored end-inclusive, hence end_idx - 1.
            annotations.append(
                Annotation(text[start_idx:end_idx],
                           self._label_map[entity["type"]],
                           (start_idx, end_idx - 1)))
            # Later entities may never match an earlier span.
            search_from = end_idx
        annotated_documents.append(
            AnnotatedDocument(document.content,
                              annotations=annotations,
                              encoding=document.encoding))
    return annotated_documents
def transform(self, X, y=None):
    """
    Annotates the list of `Document` objects that are provided as
    input and returns a list of `AnnotatedDocument` objects.

    In a dictionary based approach, a dictionary of keywords is used
    to create a FSA which is then used to search with. See [1].
    [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
    """
    annotated_documents = []
    for document in X:
        annotations = []
        doc_content_str = document.plain_text_
        doc_len = len(doc_content_str)
        for item in self.automaton.iter(doc_content_str):
            end_position, (label, word) = item
            start_position = (end_position - len(word) + 1)
            end_position = end_position + 1
            # Aho-Corasick matches partial strings in the input document,
            # which leads to spurious matches, so we check that the match
            # spans a full word before adding it to our list of valid
            # annotations. A word boundary is either the edge of the
            # document or a space character. Checking the document edges
            # first also fixes an IndexError in the previous version when
            # a match touched both the start and the end of the document
            # at once (doc_content_str[end_position] was evaluated out of
            # range).
            at_left_boundary = (
                start_position <= 0
                or doc_content_str[start_position - 1] == " ")
            at_right_boundary = (
                end_position >= doc_len
                or doc_content_str[end_position] == " ")
            if at_left_boundary and at_right_boundary:
                annotations.append(
                    Annotation(word, label,
                               (start_position, end_position)))
        annotated_documents.append(
            AnnotatedDocument(document.content,
                              annotations=annotations,
                              encoding=document.encoding))
    return annotated_documents
def split_annotated_document(
        annotated_document, splitter=document_to_sentences):
    """ Splits an annotated document and maintains the annotation offsets.

    This function accepts an AnnotatedDocument object as parameter along
    with an optional tokenization method. It splits the document according
    to the tokenization method, and returns a list of AnnotatedDocument
    objects, where the annotation offsets have been adjusted.

    Args:
        annotated_document (AnnotatedDocument): The document that will be
            split into more documents.
        splitter: (func, optional): A function that accepts a string as
            input and returns a list of strings. Defaults to
            `document_to_sentences`, which is the default sentence
            splitter for this library.

    Returns:
        list(AnnotatedDocument): A list of annotated documents. Each
            document's annotation offsets are relative to its own snippet
            and end-inclusive.
    """
    snippets = [
        snippet.strip()
        for snippet in splitter(annotated_document.plain_text_)]
    annotations = annotated_document.annotations
    cur_snippet_idx = 0
    cur_ann_idx = 0
    result_ann = []
    # Iterate every snippet of text and isolate its annotations.
    # Then construct a single AnnotatedDocument object with them.
    # Annotations are consumed strictly in order: an annotation is
    # assigned to the first snippet in which its text is found at or
    # after the running search cursor.
    # NOTE(review): once the annotation list is exhausted, any remaining
    # snippets are dropped from the result — confirm this is intended.
    while cur_snippet_idx < len(snippets) and cur_ann_idx < len(annotations):
        # Search cursor within the current snippet; prevents a repeated
        # annotation text from matching the same span twice.
        cur_substring_idx = 0
        token_ann = []
        cur_snippet = snippets[cur_snippet_idx]
        cur_annotation_text = annotations[cur_ann_idx].text
        cur_annotation_label = annotations[cur_ann_idx].label
        idx_found = cur_snippet.find(cur_annotation_text, cur_substring_idx)
        # Iterate the annotations for as long as we keep finding them in
        # the current snippet of text.
        while idx_found != -1:
            # Offsets are snippet-relative and end-inclusive.
            cur_annotation_offsets = (
                idx_found,
                idx_found + len(cur_annotation_text) - 1)
            token_ann.append(Annotation(
                cur_annotation_text,
                cur_annotation_label,
                cur_annotation_offsets))
            cur_substring_idx = idx_found + len(cur_annotation_text)
            cur_ann_idx += 1
            if cur_ann_idx < len(annotations):
                cur_annotation_text = annotations[cur_ann_idx].text
                cur_annotation_label = annotations[cur_ann_idx].label
                idx_found = cur_snippet.find(
                    cur_annotation_text, cur_substring_idx)
            else:
                # All annotations consumed.
                break
        # Emit the snippet even if no annotation matched it.
        result_ann.append(AnnotatedDocument(
            cur_snippet.encode(annotated_document.encoding), token_ann))
        cur_snippet_idx += 1
    return result_ann
def transform_bio_tags_to_annotated_document(tokens, bio_tags, document):
    """ Given a list of tokens, a list of BIO tags, and a document object,
    this function returns an annotated document formed from this
    information.

    Tags are expected in the form "B_<label>" / "I_<label>" / "O", and the
    produced annotation offsets are end-inclusive.

    Example:
        doc    -> "Barack Obama lives in the White House"
        tokens -> ['Barack', 'Obama', 'lives', 'in', 'the', 'White',
                   'House']
        bio    -> ['B_Person', 'I_Person', 'O', 'O', 'O',
                   'B_Institution', 'I_Institution']

    It returns:
        AnnotatedDocument(
            content = "Barack Obama lives in the White House"
            annotations = (
                (Barack Obama, Person, (0, 11))
                (White House, Institution, (26, 36))
            )
        )
    """
    content = document.plain_text_
    cur_token_idx = 0
    # Running search cursor into `content`; advanced past each consumed
    # token so repeated words resolve to the correct occurrence.
    cur_substring_idx = 0
    annotations = []
    while cur_token_idx < len(bio_tags):
        cur_token = tokens[cur_token_idx]
        cur_tag = bio_tags[cur_token_idx]
        # Not the beginning of an entity: advance the cursor past this
        # token's characters and move on.
        if not cur_tag.startswith("B"):
            cur_substring_idx += len(cur_token)
            cur_token_idx += 1
            continue
        # Tag format is "B_<label>".
        cur_label = cur_tag.split("_")[1]
        # Get the absolute start of the entity, given the index
        # which stores information about the previously detected
        # entity offset.
        start_idx = content.find(cur_token, cur_substring_idx)
        end_idx = start_idx + len(cur_token)
        if cur_token_idx + 1 < len(bio_tags):
            next_tag = bio_tags[cur_token_idx + 1]
            # If last word skip the following
            if next_tag.startswith("I"):
                # Consume every "I" continuation token of this entity.
                while next_tag.startswith("I"):
                    cur_token_idx += 1
                    cur_token = tokens[cur_token_idx]
                    try:
                        next_tag = bio_tags[cur_token_idx + 1]
                    except IndexError:
                        # Entity runs to the very last token.
                        break
                tmp_idx = content.find(cur_token, cur_substring_idx)
                # This line overwrites end_idx, in case there is a
                # multi-term annotation.
                end_idx = tmp_idx + len(cur_token)
        # Ends at the last character, and not after!
        idx_tuple = (start_idx, end_idx - 1)
        cur_substring_idx = end_idx
        annotations.append(Annotation(
            content[start_idx:end_idx], cur_label, idx_tuple))
        cur_token_idx += 1
    return AnnotatedDocument(
        document.content, annotations=annotations,
        encoding=document.encoding)