Example 1
def test_crf():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    # Train.
    mod = CRF()
    mod.fit([annotated_document])

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)

    ann = mod.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))
Example 2
def test_transform_annotated_documents_to_bio_format():

    # Test no annotations.
    content = b"The quick brown fox jumps over the lazy dog."
    annotated_document = AnnotatedDocument(content, annotations=None)

    expected = ([[
        'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog',
        '.'
    ]], [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])

    transformed = transform_annotated_documents_to_bio_format(
        [annotated_document])
    assert_equal(transformed, expected)

    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document = AnnotatedDocument(content, annotations=annotations)
    expected = ([[
        'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog',
        '.'
    ]], [[
        'O', 'O', 'B_COLOR', 'B_ANIMAL', 'O', 'O', 'O', 'O', 'B_ANIMAL', 'O'
    ]])

    transformed = transform_annotated_documents_to_bio_format(
        [annotated_document])
    assert_equal(transformed, expected)
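
For reference, this is how such a conversion could work for a single document: tokenize, then tag the first token of each entity span B_<label>, later tokens I_<label>, and everything else O. A minimal sketch, assuming the inclusive (start, end) character offsets used throughout these tests; the helper name to_bio is hypothetical, not part of the library:

import re

def to_bio(annotated_document):
    """Hypothetical sketch: (tokens, tags) for one annotated document."""
    text = annotated_document.plain_text_
    tokens, tags = [], []
    # Split into words and punctuation while keeping character offsets.
    for match in re.finditer(r"\w+|[^\w\s]", text):
        tag = "O"
        for ann in (annotated_document.annotations or []):
            start, end = ann.offset  # inclusive character offsets
            if start <= match.start() <= end:
                # First token of an entity gets B_, subsequent ones I_.
                tag = ("B_" if match.start() == start else "I_") + ann.label
                break
        tokens.append(match.group())
        tags.append(tag)
    return tokens, tags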
Example 3
def test_crf_multi_term():
    content = b"The dark brown fox jumps over the lazy dog magnificently."
    annotations = [
        Annotation("dark brown", "COLOR", (4, 13)),
        Annotation("fox", "ANIMAL", (15, 17)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    # Train.
    mod = CRF()
    mod.fit([annotated_document])

    # Predict. Works!
    content = b"The dark brown fox."
    document = Document(content)

    ann = mod.transform([document])

    # 2 annotations, dark brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "dark brown")
    assert_equal(ann[0].annotations[0].offset, (4, 13))
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].offset, (15, 17))
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
Example 4
def test_comparison_operators():
    annotation_1 = Annotation("pizza", "FOOD", (1, 6))
    annotation_2 = Annotation("pizza", "FOOD", (1, 6))
    annotation_3 = Annotation("cupcake", "FOOD", (9, 16))
    annotation_4 = Annotation("milk", "FOOD", (7, 11))

    assert_true(annotation_3 > annotation_1)
    assert_true(annotation_4 < annotation_3)

    assert_true(annotation_3 >= annotation_1)
    assert_true(annotation_4 <= annotation_3)

    assert_true(annotation_1 == annotation_2)
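
The ordering exercised here is consistent with comparing annotations by their character offsets, and equality with comparing all three fields. A sketch of behavior matching these assertions (an illustration, not the library's actual class):

from functools import total_ordering

@total_ordering
class Annotation:
    def __init__(self, text, label, offset):
        self.text = text
        self.label = label
        self.offset = offset  # inclusive (start, end) character offsets

    def __eq__(self, other):
        return ((self.text, self.label, self.offset) ==
                (other.text, other.label, other.offset))

    def __lt__(self, other):
        # Annotations sort by their position in the document.
        return self.offset < other.offset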
Example 5
def test_optimizer():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # Test it with CRF because it's the least time-consuming one to train.
    crf = CRF()
    hparams = {
        "c1": ExactListParam([0.1, 0.9]),
        "c2": ExactListParam([0.1, 0.9])
    }
    optimizer = Optimizer(crf, hparams, "COLOR", cv=3)
    best_estimator, f1score = optimizer.optimize_and_return_best([
        annotated_document_1, annotated_document_2, annotated_document_3
    ])

    assert_greater_equal(f1score, 0.5)
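
Under the hood, optimize_and_return_best presumably tries every combination of the listed values and keeps the best-scoring estimator. A rough sketch of that search, assuming ExactListParam exposes its candidates as .values and ignoring the cross-validation the real Optimizer performs (names here are illustrative):

import itertools

def grid_search(model, hparams, docs, score_fn):
    names = list(hparams)
    best_params, best_score = None, float("-inf")
    # Assumed: ExactListParam wraps a list of candidate values.
    for combo in itertools.product(*(hparams[n].values for n in names)):
        params = dict(zip(names, combo))
        model.fit(docs, **params)       # refit with this combination
        score = score_fn(model, docs)   # e.g. entity-level F1
        if score > best_score:
            best_params, best_score = params, score
    return best_params, best_score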
Example 6
def test_kfold_cv():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # Test it with CRF because it's the least time-consuming one to train.
    crf = CRF()
    kfold = KFoldCV(crf, k=3)
    average_f1 = kfold.cross_validate(
        [annotated_document_1, annotated_document_2, annotated_document_3],
        {"max_iterations": 100})

    # The examples and k are chosen so that this always holds.
    assert_equal(average_f1, 0.5)
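
KFoldCV presumably partitions the documents into k folds, trains on k-1 folds, scores on the held-out fold, and averages the scores. A sketch under that assumption, with a simplified exact-match F1 standing in for the library's actual scoring:

def entity_f1(predicted_docs, gold_docs):
    # Exact-match F1 over annotation sets (a simplification).
    pred = {a for d in predicted_docs for a in d.annotations}
    gold = {a for d in gold_docs for a in d.annotations}
    tp = len(pred & gold)
    if tp == 0:
        return 0.0
    precision, recall = tp / len(pred), tp / len(gold)
    return 2 * precision * recall / (precision + recall)

def kfold_f1(model, docs, k, fit_params):
    scores = []
    for fold in range(k):
        training = [d for i, d in enumerate(docs) if i % k != fold]
        held_out = [d for i, d in enumerate(docs) if i % k == fold]
        model.fit(training, **fit_params)
        scores.append(entity_f1(model.transform(held_out), held_out))
    return sum(scores) / len(scores)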
Example 7
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.

            In a dictionary-based approach, a dictionary of keywords is used
            to build an FSA, which is then used to search the text. See [1].
            [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
        """
        annotated_documents = []
        for document in X:
            annotations = []
            doc_content_str = document.plain_text_
            for item in self.automaton.iter(doc_content_str):
                end_position, (index, word) = item

                start_position = (end_position - len(word) + 1)
                end_position = end_position + 1

                annotations.append(
                    Annotation(word, self.entity_label,
                               (start_position, end_position)))

            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))

        return annotated_documents
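
The self.automaton used above is presumably built during fit from the training annotations. A minimal sketch using the pyahocorasick package, whose Automaton.iter yields (end_index, value) pairs matching the unpacking in the loop; this fit body is an assumption, not the library's code:

    def fit(self, X, y=None):
        import ahocorasick  # pyahocorasick package

        self.automaton = ahocorasick.Automaton()
        for index, document in enumerate(X):
            for annotation in document.annotations:
                # Store (index, word) as the value so that iter() can
                # recover the matched surface form.
                self.automaton.add_word(
                    annotation.text, (index, annotation.text))
        self.automaton.make_automaton()
        return self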
Example 8
def test_transform_to_spacy_format():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    expected = [("The quick brown fox jumps over the lazy dog.", {
        "entities": [(10, 15, "COLOR"), (16, 19, "ANIMAL"), (40, 43, "ANIMAL")]
    })]

    model = SpaCyStatisticalNER()
    transformed = model._transform_to_spacy_format([annotated_document])

    assert_equal(transformed, expected)
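
The conversion itself only needs to decode the content and shift the inclusive end offset by one, since spaCy expects exclusive ends. A plausible sketch of that step (the standalone function is illustrative):

def to_spacy_format(annotated_documents):
    train_data = []
    for doc in annotated_documents:
        entities = []
        for ann in doc.annotations:
            start, end = ann.offset  # inclusive offsets
            # spaCy expects an exclusive end offset, hence the +1.
            entities.append((start, end + 1, ann.label))
        train_data.append((doc.plain_text_, {"entities": entities}))
    return train_data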
Example 9
def test_ExactMatchMultiClassDictionaryNER2():
    documents = [
        AnnotatedDocument(b"""
        In this study , we have used the polymerase chain reaction ( PCR ) with nested 
        primers to analyze X-inactivation patterns of the HUMARA loci in purified eosinophils 
        from female patients with eosinophilia .
        """, annotations= [
            Annotation("HUMARA loci", "DNA", (139, 150)),
            Annotation("purified eosinophils", "cell-type", (154, 174))
        ])]
    ner = ExactMatchMultiClassDictionaryNER(
        "nerds/test/data/dictionary/biodictionary.txt")
    ner.fit(documents)
    pred_documents = ner.transform(documents)
    expected_labels = ["DNA", "cell-type"]  # gold labels, in annotation order
    for i, annotation in enumerate(pred_documents[0].annotations):
        pred_text = annotation.text
        pred_offsets = annotation.offset
        label_text = documents[0].plain_text_[pred_offsets[0]:pred_offsets[1]]
        assert_equal(pred_text, label_text, 
            "predicted {:s} != label {:s}".format(pred_text, label_text))
        assert_equal(annotation.label, expected_labels[i])
Example 10
def test_spacy():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    data = [annotated_document_1, annotated_document_2, annotated_document_3]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)

    ann = model.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
Example 11
def test_spacy_multi_term():
    content = b"The quick dark brown fox jumps over the lazy dog."
    annotations = [
        Annotation("dark brown", "COLOR", (10, 19)),
        Annotation("fox", "ANIMAL", (21, 23)),
        Annotation("dog", "ANIMAL", (45, 47))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A dark brown fox jumps quickly."
    annotations = [
        Annotation("dark brown", "COLOR", (2, 11)),
        Annotation("fox", "ANIMAL", (13, 15))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was dark brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("dark brown", "COLOR", (17, 26)),
        Annotation("dog", "ANIMAL", (41, 43))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    data = [annotated_document_1, annotated_document_2, annotated_document_3]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict. Works!
    content = b"The dark brown fox."
    document = Document(content)

    ann = model.transform([document])

    # 2 annotations, dark brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "dark brown")
    assert_equal(ann[0].annotations[0].offset, (4, 13))
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].offset, (15, 17))
    assert_equal(ann[0].annotations[1].label, "ANIMAL")

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
Example 12
def test_ner_ensemble_configuration():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # The config file defines params for a CRF with c1 = 0.1 and c2 = 0.1.
    ner_ensemble_config = NERModelEnsembleConfiguration(
        "nerds/test/data/config/sample.yaml")
    ner_ensemble_config.fit([
        annotated_document_1,
        annotated_document_2,
        annotated_document_3])

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)

    ann = ner_ensemble_config.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))
Example 13
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.
        """
        annotated_documents = []
        for document in X:
            annotated_document = self.nlp(document.plain_text_)
            annotations = []
            for named_entity in annotated_document.ents:
                annotations.append(
                    Annotation(
                        named_entity.text, named_entity.label_,
                        (named_entity.start_char, named_entity.end_char - 1)))
            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))
        return annotated_documents
Example 14
def test_ensemble_pooling():
    ensemble = NERModelEnsemblePooling([NERModel(), NERModel(), NERModel()])

    x1 = Annotation("pizza", "FOOD", (1, 6))
    x2 = Annotation("milk", "FOOD", (7, 11))
    x3 = Annotation("cupcake", "FOOD", (12, 19))
    x4 = Annotation("kebab", "FOOD", (20, 25))
    x5 = Annotation("pie", "FOOD", (29, 32))
    x6 = Annotation("cheese", "FOOD", (35, 41))

    entity_matrix = [[x1, x2, x4], [x1, x3, x5], [x1, x2, x4, x6]]

    assert_equal(ensemble.vote(entity_matrix), [x1, x2, x3, x4, x5, x6])
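
Pooling amounts to a sorted union of everything any model proposed. A minimal sketch consistent with this assertion, relying on Annotation being hashable and ordered as the earlier tests show (the function name is illustrative):

def pooling_vote(entity_matrix):
    # Union of all models' annotations, returned in document order.
    pooled = set()
    for entity_list in entity_matrix:
        pooled.update(entity_list)
    return sorted(pooled)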
Example 15
def test_ensemble_majority_vote():
    ensemble = NERModelEnsembleMajorityVote(
        [NERModel(), NERModel(), NERModel()])

    x1 = Annotation("pizza", "FOOD", (1, 6))
    x2 = Annotation("milk", "FOOD", (7, 11))
    x3 = Annotation("cupcake", "FOOD", (12, 19))
    x4 = Annotation("kebab", "FOOD", (20, 25))
    x5 = Annotation("pie", "FOOD", (29, 32))
    x6 = Annotation("cheese", "FOOD", (35, 41))

    entity_matrix = [[x1, x2, x4], [x1, x3, x5], [x1, x2, x4, x6]]

    # Majority vote: 2 out of 3 classifiers voted for x1, x2 and x4.
    assert_equal(ensemble.vote(entity_matrix), [x1, x2, x4])
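
Majority voting keeps an annotation only when a strict majority of the models produced it. A sketch under the same assumptions:

from collections import Counter

def majority_vote(entity_matrix):
    votes = Counter()
    for entity_list in entity_matrix:
        votes.update(entity_list)
    quorum = len(entity_matrix) / 2.0
    # Keep annotations proposed by more than half of the models.
    return sorted(ann for ann, count in votes.items() if count > quorum)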
Example 16
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.
        """
        annotated_documents = []
        for document in X:
            content = sentence_to_tokens(document.plain_text_)
            output = self.model.analyze(content)
            substring_index = 0
            annotations = []
            for entity in output["entities"]:
                start_idx, end_idx = _get_offsets_with_fuzzy_matching(
                    document.plain_text_, entity["text"], substring_index)
                offset = (start_idx, end_idx - 1)
                annotations.append(
                    Annotation(document.plain_text_[start_idx:end_idx],
                               self._label_map[entity["type"]], offset))
                substring_index = end_idx
            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))
        return annotated_documents
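
_get_offsets_with_fuzzy_matching is presumably a tolerant substring search: the tokenizer may have altered whitespace, so an exact find from substring_index with a fallback is plausible. A naive sketch (exact matching only, without the real fuzzing):

def _get_offsets_with_fuzzy_matching(text, entity_text, from_index):
    # Exact search first; a real implementation would also tolerate
    # tokenization artifacts such as extra spaces or split punctuation.
    start = text.find(entity_text, from_index)
    if start == -1:
        start = text.find(entity_text)  # fall back to the whole document
    end = start + len(entity_text)
    return start, end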
Example 17
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.

            In a dictionary-based approach, a dictionary of keywords is used
            to build an FSA, which is then used to search the text. See [1].
            [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
        """
        annotated_documents = []
        for document in X:
            annotations = []
            doc_content_str = document.plain_text_
            for item in self.automaton.iter(doc_content_str):
                end_position, (label, word) = item

                start_position = (end_position - len(word) + 1)
                end_position = end_position + 1

                # Aho-Corasick matches substrings anywhere in the input,
                # which leads to spurious partial-word matches, so we check
                # that the match spans a full word before adding it to our
                # list of valid annotations.
                at_word_start = (start_position <= 0
                                 or doc_content_str[start_position - 1] == " ")
                at_word_end = (end_position >= len(doc_content_str)
                               or doc_content_str[end_position] == " ")
                if at_word_start and at_word_end:
                    annotations.append(
                        Annotation(word, label,
                                   (start_position, end_position)))

            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))

        return annotated_documents
Example 18
    def _read_brat_ann_file(self, path_to_ann_file):
        """ Helper function to read brat annotations.
            TODO: Right now, it reads only ENTITIES from BRAT ann files,
            but we need to extend it to also read ENTITY RELATIONSHIPS.
        """

        annotations = set()

        if isfile(path_to_ann_file):
            with open(path_to_ann_file, 'rb') as ann_file:
                for ann_line in ann_file:
                    ann_line = ann_line.decode(self.encoding)
                    # Comments start with a hash
                    if ann_line.startswith("#"):
                        continue

                    split_ann_line = ann_line.strip().split("\t")

                    # Entity annotations have exactly 3 tab-separated fields,
                    # e.g.: "TEmma2\tGrant 475 491\tGIA G-14-0006063"
                    # Lines may also encode relations.
                    # TODO: Add code to read relations.
                    if len(split_ann_line) > 2:
                        entity_str = split_ann_line[2]

                        # Looks like "Grant 475 491"
                        entity_type_offsets = split_ann_line[1].split(" ")
                        entity_name = entity_type_offsets[0]
                        start_offset = int(entity_type_offsets[1])
                        end_offset = int(entity_type_offsets[2]) - 1

                        annotations.add(Annotation(
                            entity_str, entity_name,
                            (start_offset, end_offset)))

        return sorted(list(annotations))
Example 19
def test_ensemble_weighted_vote():
    ensemble = NERModelEnsembleWeightedVote(
        [NERModel(), NERModel(), NERModel()])

    ensemble.confidence_scores = [0.4, 0.7, 0.3]

    x1 = Annotation("pizza", "FOOD", (1, 6))
    x2 = Annotation("milk", "FOOD", (7, 11))
    x3 = Annotation("cupcake", "FOOD", (12, 19))
    x4 = Annotation("kebab", "FOOD", (20, 25))
    x5 = Annotation("pie", "FOOD", (29, 32))
    x6 = Annotation("cheese", "FOOD", (35, 41))

    entity_matrix = [[x1, x2, x4], [x1, x3, x5], [x1, x2, x4, x6]]

    # Unlike the majority vote, here we expect to see x3 and x5 in the
    # annotations, because they come from a classifier of significantly
    # higher confidence.
    assert_equal(ensemble.vote(entity_matrix), [x1, x2, x3, x4, x5])
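
A consistent reading of this expectation: each annotation accumulates the confidence of every model that proposed it, and survives when that weight reaches half of the total confidence. x3 and x5 get 0.7 from the strongest model alone, exactly half of 0.4 + 0.7 + 0.3, while x6 only gets 0.3. The threshold below is inferred from the test, not confirmed by the source:

from collections import defaultdict

def weighted_vote(entity_matrix, confidence_scores):
    weights = defaultdict(float)
    for score, entity_list in zip(confidence_scores, entity_matrix):
        for ann in entity_list:
            weights[ann] += score
    # Keep annotations whose accumulated confidence reaches half the
    # total; the epsilon absorbs floating-point accumulation error.
    threshold = sum(confidence_scores) / 2.0 - 1e-9
    return sorted(a for a, w in weights.items() if w >= threshold)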
Example 20
def split_annotated_document(
        annotated_document, splitter=document_to_sentences):
    """ Splits an annotated document and maintains the annotation offsets.

        This function accepts an AnnotatedDocument object as parameter along
        with an optional tokenization method. It splits the document according
        to the tokenization method, and returns a list of AnnotatedDocument
        objects, where the annotation offsets have been adjusted.

        Args:
            annotated_document (AnnotatedDocument): The document that will be
                split into more documents.
            splitter (func, optional): A function that accepts a string as
                input and returns a list of strings. Defaults to
                `document_to_sentences`, which is the default sentence splitter
                for this library.

        Returns:
            list(AnnotatedDocument): A list of annotated documents.
    """

    snippets = [
        snippet.strip() for snippet in
        splitter(annotated_document.plain_text_)]
    annotations = annotated_document.annotations

    cur_snippet_idx = 0
    cur_ann_idx = 0

    result_ann = []

    # Iterate every snippet of text and isolate its annotations.
    # Then construct a single AnnotatedDocument object with them.
    while cur_snippet_idx < len(snippets) and cur_ann_idx < len(annotations):
        cur_substring_idx = 0

        token_ann = []

        cur_snippet = snippets[cur_snippet_idx]

        cur_annotation_text = annotations[cur_ann_idx].text
        cur_annotation_label = annotations[cur_ann_idx].label
        idx_found = cur_snippet.find(cur_annotation_text, cur_substring_idx)
        # Iterate the annotations for as long as we keep finding them in
        # the current snippet of text.
        while idx_found != -1:
            cur_annotation_offsets = (
                idx_found, idx_found + len(cur_annotation_text) - 1)
            token_ann.append(Annotation(
                cur_annotation_text,
                cur_annotation_label,
                cur_annotation_offsets))

            cur_substring_idx = idx_found + len(cur_annotation_text)
            cur_ann_idx += 1

            if cur_ann_idx < len(annotations):
                cur_annotation_text = annotations[cur_ann_idx].text
                cur_annotation_label = annotations[cur_ann_idx].label
                idx_found = cur_snippet.find(
                    cur_annotation_text, cur_substring_idx)
            else:
                break

        result_ann.append(AnnotatedDocument(
            cur_snippet.encode(annotated_document.encoding),
            token_ann))
        cur_snippet_idx += 1

    return result_ann
Example 21
def transform_bio_tags_to_annotated_document(tokens, bio_tags, document):
    """ Given a list of tokens, a list of BIO tags, and a document object,
        this function returns an annotated document formed from them.

        Example:
            doc -> "Barack Obama lives in the White House"
            tokens ->
            [['Barack', 'Obama', 'lives', 'in', 'the', 'White', 'House']]
            bio ->
            [['B_Person', 'I_Person', 'O', 'O', 'O', 'B_Institution',
            'I_Institution']]

        It returns:
        AnnotatedDocument(
        content = "Barack Obama lives in the White House"
        annotations = (
            (Barack Obama, Person, (0, 11))
            (White House, Institution, (26, 36))
            )
        )
    """
    content = document.plain_text_

    cur_token_idx = 0
    cur_substring_idx = 0

    annotations = []
    while cur_token_idx < len(bio_tags):
        cur_token = tokens[cur_token_idx]
        cur_tag = bio_tags[cur_token_idx]

        if not cur_tag.startswith("B"):
            cur_substring_idx += len(cur_token)
            cur_token_idx += 1
            continue

        cur_label = cur_tag.split("_")[1]

        # Get the absolute start of the entity, given the index
        # which stores information about the previously detected
        # entity offset.
        start_idx = content.find(cur_token, cur_substring_idx)
        end_idx = start_idx + len(cur_token)

        if cur_token_idx + 1 < len(bio_tags):
            next_tag = bio_tags[cur_token_idx + 1]
            # If the next tag continues the entity (I_*), extend the
            # annotation span across the remaining tokens.
            if next_tag.startswith("I"):
                while next_tag.startswith("I"):
                    cur_token_idx += 1
                    cur_token = tokens[cur_token_idx]
                    try:
                        next_tag = bio_tags[cur_token_idx + 1]
                    except IndexError:
                        break

                tmp_idx = content.find(cur_token, cur_substring_idx)
                # This line overwrites end_idx, in case there is a
                # multi-term annotation.
                end_idx = tmp_idx + len(cur_token)

        # Ends at the last character, and not after!
        idx_tuple = (start_idx, end_idx - 1)
        cur_substring_idx = end_idx

        annotations.append(Annotation(
            content[start_idx:end_idx],
            cur_label,
            idx_tuple))

        cur_token_idx += 1

    return AnnotatedDocument(
        document.content, annotations=annotations, encoding=document.encoding)
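
A quick usage sketch mirroring the docstring example (assuming a Document built from the same sentence):

doc = Document(b"Barack Obama lives in the White House")
tokens = ["Barack", "Obama", "lives", "in", "the", "White", "House"]
tags = ["B_Person", "I_Person", "O", "O", "O",
        "B_Institution", "I_Institution"]

annotated = transform_bio_tags_to_annotated_document(tokens, tags, doc)
# annotated.annotations -> [("Barack Obama", "Person", (0, 11)),
#                           ("White House", "Institution", (26, 36))]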
Example 22
def test_to_inline_string():
    annotation = Annotation("pizza", "FOOD", (1, 6))
    assert_equal(annotation.to_inline_string(), "FOOD[pizza]")
Example 23
def test_str():
    annotation = Annotation("pizza", "FOOD", (1, 6))
    assert_equal(str(annotation), "1,6 FOOD[pizza]")
Example 24
def test_hash():
    annotation_1 = Annotation("pizza", "FOOD", (1, 6))
    annotation_2 = Annotation("pizza", "FOOD", (1, 6))

    assert_equal(hash(annotation_1), hash(annotation_2))
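
These three behaviors (Examples 22-24) follow from treating (text, label, offset) as the annotation's identity. Continuing the illustrative Annotation class from the comparison example, the corresponding methods might look like this:

    def __hash__(self):
        # Equal annotations (same text, label, offset) hash identically.
        return hash((self.text, self.label, self.offset))

    def to_inline_string(self):
        return "{}[{}]".format(self.label, self.text)

    def __str__(self):
        start, end = self.offset
        return "{},{} {}".format(start, end, self.to_inline_string())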
Example 25
def test_split_annotated_document():
    content = (b"The quick brown fox jumps over the lazy dog. "
               b"Grumpy wizards make a toxic brew for the jovial queen.")
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42)),
        Annotation("wizards", "PERSON", (52, 58)),
        Annotation("brew", "DRINK", (73, 76)),
        Annotation("queen", "PERSON", (93, 97))
    ]
    annotated_document = AnnotatedDocument(content, annotations)
    result = split_annotated_document(annotated_document)

    assert_equal(result[0].content,
                 b"The quick brown fox jumps over the lazy dog.")
    assert_equal(result[1].content,
                 b"Grumpy wizards make a toxic brew for the jovial queen.")

    expected_annotations_doc1 = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    assert_equal(result[0].annotations, expected_annotations_doc1)

    expected_annotations_doc2 = [
        Annotation("wizards", "PERSON", (7, 13)),
        Annotation("brew", "DRINK", (28, 31)),
        Annotation("queen", "PERSON", (48, 52))
    ]
    assert_equal(result[1].annotations, expected_annotations_doc2)