Example 1
def test_crf():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    # Train.
    mod = CRF()
    mod.fit([annotated_document])

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)

    ann = mod.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))
Example 2
def test_transform_annotated_documents_to_bio_format():

    # Test no annotations.
    content = b"The quick brown fox jumps over the lazy dog."
    annotated_document = AnnotatedDocument(content, annotations=None)

    expected = ([[
        'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog',
        '.'
    ]], [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])

    transformed = transform_annotated_documents_to_bio_format(
        [annotated_document])
    assert_equal(transformed, expected)

    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document = AnnotatedDocument(content, annotations=annotations)
    expected = ([[
        'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog',
        '.'
    ]], [[
        'O', 'O', 'B_COLOR', 'B_ANIMAL', 'O', 'O', 'O', 'O', 'B_ANIMAL', 'O'
    ]])

    transformed = transform_annotated_documents_to_bio_format(
        [annotated_document])
    assert_equal(transformed, expected)
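
For reference, this is how such a conversion could work for a single document: tokenize, then tag the first token of each entity span B_<label>, later tokens I_<label>, and everything else O. A minimal sketch, assuming the inclusive (start, end) character offsets used throughout these tests; the helper name to_bio is hypothetical, not part of the library:

import re

def to_bio(annotated_document):
    """Hypothetical sketch: (tokens, tags) for one annotated document."""
    text = annotated_document.plain_text_
    tokens, tags = [], []
    # Split into words and punctuation while keeping character offsets.
    for match in re.finditer(r"\w+|[^\w\s]", text):
        tag = "O"
        for ann in (annotated_document.annotations or []):
            start, end = ann.offset  # inclusive character offsets
            if start <= match.start() <= end:
                # First token of an entity gets B_, subsequent ones I_.
                tag = ("B_" if match.start() == start else "I_") + ann.label
                break
        tokens.append(match.group())
        tags.append(tag)
    return tokens, tags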
Example 3
def test_crf_multi_term():
    content = b"The dark brown fox jumps over the lazy dog magnificently."
    annotations = [
        Annotation("dark brown", "COLOR", (4, 13)),
        Annotation("fox", "ANIMAL", (15, 17)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    # Train.
    mod = CRF()
    mod.fit([annotated_document])

    # Predict. Works!
    content = b"The dark brown fox."
    document = Document(content)

    ann = mod.transform([document])

    # 2 annotations, dark brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "dark brown")
    assert_equal(ann[0].annotations[0].offset, (4, 13))
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].offset, (15, 17))
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
Example 4
def test_comparison_operators():
    annotation_1 = Annotation("pizza", "FOOD", (1, 6))
    annotation_2 = Annotation("pizza", "FOOD", (1, 6))
    annotation_3 = Annotation("cupcake", "FOOD", (9, 16))
    annotation_4 = Annotation("milk", "FOOD", (7, 11))

    assert_true(annotation_3 > annotation_1)
    assert_true(annotation_4 < annotation_3)

    assert_true(annotation_3 >= annotation_1)
    assert_true(annotation_4 <= annotation_3)

    assert_true(annotation_1 == annotation_2)
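
The ordering exercised here is consistent with comparing annotations by their character offsets, and equality with comparing all three fields. A sketch of behavior matching these assertions (an illustration, not the library's actual class):

from functools import total_ordering

@total_ordering
class Annotation:
    def __init__(self, text, label, offset):
        self.text = text
        self.label = label
        self.offset = offset  # inclusive (start, end) character offsets

    def __eq__(self, other):
        return ((self.text, self.label, self.offset) ==
                (other.text, other.label, other.offset))

    def __lt__(self, other):
        # Annotations sort by their position in the document.
        return self.offset < other.offset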
Example 5
def test_optimizer():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # Test it with CRF because it's the least time-consuming one to train.
    crf = CRF()
    hparams = {
        "c1": ExactListParam([0.1, 0.9]),
        "c2": ExactListParam([0.1, 0.9])
    }
    optimizer = Optimizer(crf, hparams, "COLOR", cv=3)
    best_estimator, f1score = optimizer.optimize_and_return_best([
        annotated_document_1, annotated_document_2, annotated_document_3
    ])

    assert_greater_equal(f1score, 0.5)
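
Under the hood, optimize_and_return_best presumably tries every combination of the listed values and keeps the best-scoring estimator. A rough sketch of that search, assuming ExactListParam exposes its candidates as .values and ignoring the cross-validation the real Optimizer performs (names here are illustrative):

import itertools

def grid_search(model, hparams, docs, score_fn):
    names = list(hparams)
    best_params, best_score = None, float("-inf")
    # Assumed: ExactListParam wraps a list of candidate values.
    for combo in itertools.product(*(hparams[n].values for n in names)):
        params = dict(zip(names, combo))
        model.fit(docs, **params)       # refit with this combination
        score = score_fn(model, docs)   # e.g. entity-level F1
        if score > best_score:
            best_params, best_score = params, score
    return best_params, best_score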
Example 6
def test_kfold_cv():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # Test it with CRF because it's the least time-consuming one to train.
    crf = CRF()
    kfold = KFoldCV(crf, k=3)
    average_f1 = kfold.cross_validate(
        [annotated_document_1, annotated_document_2, annotated_document_3],
        {"max_iterations": 100})

    # The examples and k are chosen so that this always holds.
    assert_equal(average_f1, 0.5)
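
KFoldCV presumably partitions the documents into k folds, trains on k-1 folds, scores on the held-out fold, and averages the scores. A sketch under that assumption, with a simplified exact-match F1 standing in for the library's actual scoring:

def entity_f1(predicted_docs, gold_docs):
    # Exact-match F1 over annotation sets (a simplification).
    pred = {a for d in predicted_docs for a in d.annotations}
    gold = {a for d in gold_docs for a in d.annotations}
    tp = len(pred & gold)
    if tp == 0:
        return 0.0
    precision, recall = tp / len(pred), tp / len(gold)
    return 2 * precision * recall / (precision + recall)

def kfold_f1(model, docs, k, fit_params):
    scores = []
    for fold in range(k):
        training = [d for i, d in enumerate(docs) if i % k != fold]
        held_out = [d for i, d in enumerate(docs) if i % k == fold]
        model.fit(training, **fit_params)
        scores.append(entity_f1(model.transform(held_out), held_out))
    return sum(scores) / len(scores)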
Example 7
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.

            In a dictionary-based approach, a dictionary of keywords is used
            to build an FSA, which is then used to search the text. See [1].
            [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
        """
        annotated_documents = []
        for document in X:
            annotations = []
            doc_content_str = document.plain_text_
            for item in self.automaton.iter(doc_content_str):
                end_position, (index, word) = item

                start_position = (end_position - len(word) + 1)
                end_position = end_position + 1

                annotations.append(
                    Annotation(word, self.entity_label,
                               (start_position, end_position)))

            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))

        return annotated_documents
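
The self.automaton used above is presumably built during fit from the training annotations. A minimal sketch using the pyahocorasick package, whose Automaton.iter yields (end_index, value) pairs matching the unpacking in the loop; this fit body is an assumption, not the library's code:

    def fit(self, X, y=None):
        import ahocorasick  # pyahocorasick package

        self.automaton = ahocorasick.Automaton()
        for index, document in enumerate(X):
            for annotation in document.annotations:
                # Store (index, word) as the value so that iter() can
                # recover the matched surface form.
                self.automaton.add_word(
                    annotation.text, (index, annotation.text))
        self.automaton.make_automaton()
        return self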
Example 8
def test_transform_to_spacy_format():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    expected = [("The quick brown fox jumps over the lazy dog.", {
        "entities": [(10, 15, "COLOR"), (16, 19, "ANIMAL"), (40, 43, "ANIMAL")]
    })]

    model = SpaCyStatisticalNER()
    transformed = model._transform_to_spacy_format([annotated_document])

    assert_equal(transformed, expected)
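
The conversion itself only needs to decode the content and shift the inclusive end offset by one, since spaCy expects exclusive ends. A plausible sketch of that step (the standalone function is illustrative):

def to_spacy_format(annotated_documents):
    train_data = []
    for doc in annotated_documents:
        entities = []
        for ann in doc.annotations:
            start, end = ann.offset  # inclusive offsets
            # spaCy expects an exclusive end offset, hence the +1.
            entities.append((start, end + 1, ann.label))
        train_data.append((doc.plain_text_, {"entities": entities}))
    return train_data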
Example 9
def test_ExactMatchMultiClassDictionaryNER2():
    documents = [
        AnnotatedDocument(b"""
        In this study , we have used the polymerase chain reaction ( PCR ) with nested 
        primers to analyze X-inactivation patterns of the HUMARA loci in purified eosinophils 
        from female patients with eosinophilia .
        """, annotations= [
            Annotation("HUMARA loci", "DNA", (139, 150)),
            Annotation("purified eosinophils", "cell-type", (154, 174))
        ])]
    ner = ExactMatchMultiClassDictionaryNER(
        "nerds/test/data/dictionary/biodictionary.txt")
    ner.fit(documents)
    pred_documents = ner.transform(documents)
    expected_labels = ["DNA", "cell-type"]  # gold labels, in annotation order
    for i, annotation in enumerate(pred_documents[0].annotations):
        pred_text = annotation.text
        pred_offsets = annotation.offset
        label_text = documents[0].plain_text_[pred_offsets[0]:pred_offsets[1]]
        assert_equal(pred_text, label_text, 
            "predicted {:s} != label {:s}".format(pred_text, label_text))
        assert_equal(annotation.label, expected_labels[i])
Example 10
def test_spacy():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    data = [annotated_document_1, annotated_document_2, annotated_document_3]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)

    ann = model.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
Example 11
def test_spacy_multi_term():
    content = b"The quick dark brown fox jumps over the lazy dog."
    annotations = [
        Annotation("dark brown", "COLOR", (10, 19)),
        Annotation("fox", "ANIMAL", (21, 23)),
        Annotation("dog", "ANIMAL", (45, 47))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A dark brown fox jumps quickly."
    annotations = [
        Annotation("dark brown", "COLOR", (2, 11)),
        Annotation("fox", "ANIMAL", (13, 15))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was dark brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("dark brown", "COLOR", (17, 26)),
        Annotation("dog", "ANIMAL", (41, 43))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    data = [annotated_document_1, annotated_document_2, annotated_document_3]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict. Works!
    content = b"The dark brown fox."
    document = Document(content)

    ann = model.transform([document])

    # 2 annotations, dark brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "dark brown")
    assert_equal(ann[0].annotations[0].offset, (4, 13))
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].offset, (15, 17))
    assert_equal(ann[0].annotations[1].label, "ANIMAL")

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
Example 12
def test_ner_ensemble_configuration():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # The config file defines params for a CRF with c1 = 0.1 and c2 = 0.1.
    ner_ensemble_config = NERModelEnsembleConfiguration(
        "nerds/test/data/config/sample.yaml")
    ner_ensemble_config.fit([
        annotated_document_1,
        annotated_document_2,
        annotated_document_3])

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)

    ann = ner_ensemble_config.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))
Example 13
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.
        """
        annotated_documents = []
        for document in X:
            annotated_document = self.nlp(document.plain_text_)
            annotations = []
            for named_entity in annotated_document.ents:
                annotations.append(
                    Annotation(
                        named_entity.text, named_entity.label_,
                        (named_entity.start_char, named_entity.end_char - 1)))
            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))
        return annotated_documents
Example 14
def test_ensemble_pooling():
    ensemble = NERModelEnsemblePooling([NERModel(), NERModel(), NERModel()])

    x1 = Annotation("pizza", "FOOD", (1, 6))
    x2 = Annotation("milk", "FOOD", (7, 11))
    x3 = Annotation("cupcake", "FOOD", (12, 19))
    x4 = Annotation("kebab", "FOOD", (20, 25))
    x5 = Annotation("pie", "FOOD", (29, 32))
    x6 = Annotation("cheese", "FOOD", (35, 41))

    entity_matrix = [[x1, x2, x4], [x1, x3, x5], [x1, x2, x4, x6]]

    assert_equal(ensemble.vote(entity_matrix), [x1, x2, x3, x4, x5, x6])
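
Pooling amounts to a sorted union of everything any model proposed. A minimal sketch consistent with this assertion, relying on Annotation being hashable and ordered as the earlier tests show (the function name is illustrative):

def pooling_vote(entity_matrix):
    # Union of all models' annotations, returned in document order.
    pooled = set()
    for entity_list in entity_matrix:
        pooled.update(entity_list)
    return sorted(pooled)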
Example 15
def test_ensemble_majority_vote():
    ensemble = NERModelEnsembleMajorityVote(
        [NERModel(), NERModel(), NERModel()])

    x1 = Annotation("pizza", "FOOD", (1, 6))
    x2 = Annotation("milk", "FOOD", (7, 11))
    x3 = Annotation("cupcake", "FOOD", (12, 19))
    x4 = Annotation("kebab", "FOOD", (20, 25))
    x5 = Annotation("pie", "FOOD", (29, 32))
    x6 = Annotation("cheese", "FOOD", (35, 41))

    entity_matrix = [[x1, x2, x4], [x1, x3, x5], [x1, x2, x4, x6]]

    # Majority vote: 2 out of 3 classifiers voted for x1, x2 and x4.
    assert_equal(ensemble.vote(entity_matrix), [x1, x2, x4])
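
Majority voting keeps an annotation only when a strict majority of the models produced it. A sketch under the same assumptions:

from collections import Counter

def majority_vote(entity_matrix):
    votes = Counter()
    for entity_list in entity_matrix:
        votes.update(entity_list)
    quorum = len(entity_matrix) / 2.0
    # Keep annotations proposed by more than half of the models.
    return sorted(ann for ann, count in votes.items() if count > quorum)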
Example 16
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.
        """
        annotated_documents = []
        for document in X:
            content = sentence_to_tokens(document.plain_text_)
            output = self.model.analyze(content)
            substring_index = 0
            annotations = []
            for entity in output["entities"]:
                start_idx, end_idx = _get_offsets_with_fuzzy_matching(
                    document.plain_text_, entity["text"], substring_index)
                offset = (start_idx, end_idx - 1)
                annotations.append(
                    Annotation(document.plain_text_[start_idx:end_idx],
                               self._label_map[entity["type"]], offset))
                substring_index = end_idx
            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))
        return annotated_documents
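
_get_offsets_with_fuzzy_matching is presumably a tolerant substring search: the tokenizer may have altered whitespace, so an exact find from substring_index with a fallback is plausible. A naive sketch (exact matching only, without the real fuzzing):

def _get_offsets_with_fuzzy_matching(text, entity_text, from_index):
    # Exact search first; a real implementation would also tolerate
    # tokenization artifacts such as extra spaces or split punctuation.
    start = text.find(entity_text, from_index)
    if start == -1:
        start = text.find(entity_text)  # fall back to the whole document
    end = start + len(entity_text)
    return start, end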
Example 17
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.

            In a dictionary-based approach, a dictionary of keywords is used
            to build an FSA, which is then used to search the text. See [1].
            [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
        """
        annotated_documents = []
        for document in X:
            annotations = []
            doc_content_str = document.plain_text_
            for item in self.automaton.iter(doc_content_str):
                end_position, (label, word) = item

                start_position = (end_position - len(word) + 1)
                end_position = end_position + 1

                # Aho-Corasick matches substrings anywhere in the input,
                # which leads to spurious partial-word matches, so we check
                # that the match spans a full word before adding it to our
                # list of valid annotations.
                at_word_start = (start_position <= 0
                                 or doc_content_str[start_position - 1] == " ")
                at_word_end = (end_position >= len(doc_content_str)
                               or doc_content_str[end_position] == " ")
                if at_word_start and at_word_end:
                    annotations.append(
                        Annotation(word, label,
                                   (start_position, end_position)))

            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))

        return annotated_documents
Example 18
    def _read_brat_ann_file(self, path_to_ann_file):
        """ Helper function to read brat annotations.
            TODO: Right now, it reads only ENTITIES from BRAT ann files,
            but we need to extend it to also read ENTITY RELATIONSHIPS.
        """

        annotations = set()

        if isfile(path_to_ann_file):
            with open(path_to_ann_file, 'rb') as ann_file:
                for ann_line in ann_file:
                    ann_line = ann_line.decode(self.encoding)
                    # Comments start with a hash
                    if ann_line.startswith("#"):
                        continue

                    split_ann_line = ann_line.strip().split("\t")

                    # Entity annotations have exactly 3 tab-separated fields,
                    # e.g.: "TEmma2\tGrant 475 491\tGIA G-14-0006063"
                    # Lines may also encode relations.
                    # TODO: Add code to read relations.
                    if len(split_ann_line) > 2:
                        entity_str = split_ann_line[2]

                        # Looks like "Grant 475 491"
                        entity_type_offsets = split_ann_line[1].split(" ")
                        entity_name = entity_type_offsets[0]
                        start_offset = int(entity_type_offsets[1])
                        end_offset = int(entity_type_offsets[2]) - 1

                        annotations.add(Annotation(
                            entity_str, entity_name,
                            (start_offset, end_offset)))

        return sorted(list(annotations))
Example 19
def test_ensemble_weighted_vote():
    ensemble = NERModelEnsembleWeightedVote(
        [NERModel(), NERModel(), NERModel()])

    ensemble.confidence_scores = [0.4, 0.7, 0.3]

    x1 = Annotation("pizza", "FOOD", (1, 6))
    x2 = Annotation("milk", "FOOD", (7, 11))
    x3 = Annotation("cupcake", "FOOD", (12, 19))
    x4 = Annotation("kebab", "FOOD", (20, 25))
    x5 = Annotation("pie", "FOOD", (29, 32))
    x6 = Annotation("cheese", "FOOD", (35, 41))

    entity_matrix = [[x1, x2, x4], [x1, x3, x5], [x1, x2, x4, x6]]

    # Unlike the majority vote, here we expect to see x3 and x5 in the
    # annotations, because they come from a classifier of significantly
    # higher confidence.
    assert_equal(ensemble.vote(entity_matrix), [x1, x2, x3, x4, x5])
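
A consistent reading of this expectation: each annotation accumulates the confidence of every model that proposed it, and survives when that weight reaches half of the total confidence. x3 and x5 get 0.7 from the strongest model alone, exactly half of 0.4 + 0.7 + 0.3, while x6 only gets 0.3. The threshold below is inferred from the test, not confirmed by the source:

from collections import defaultdict

def weighted_vote(entity_matrix, confidence_scores):
    weights = defaultdict(float)
    for score, entity_list in zip(confidence_scores, entity_matrix):
        for ann in entity_list:
            weights[ann] += score
    # Keep annotations whose accumulated confidence reaches half the
    # total; the epsilon absorbs floating-point accumulation error.
    threshold = sum(confidence_scores) / 2.0 - 1e-9
    return sorted(a for a, w in weights.items() if w >= threshold)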
Example 20
def split_annotated_document(
        annotated_document, splitter=document_to_sentences):
    """ Splits an annotated document and maintains the annotation offsets.

        This function accepts an AnnotatedDocument object as parameter along
        with an optional tokenization method. It splits the document according
        to the tokenization method, and returns a list of AnnotatedDocument
        objects, where the annotation offsets have been adjusted.

        Args:
            annotated_document (AnnotatedDocument): The document that will be
                split into more documents.
            splitter (func, optional): A function that accepts a string as
                input and returns a list of strings. Defaults to
                `document_to_sentences`, which is the default sentence splitter
                for this library.

        Returns:
            list(AnnotatedDocument): A list of annotated documents.
    """

    snippets = [
        snippet.strip() for snippet in
        splitter(annotated_document.plain_text_)]
    annotations = annotated_document.annotations

    cur_snippet_idx = 0
    cur_ann_idx = 0

    result_ann = []

    # Iterate every snippet of text and isolate its annotations.
    # Then construct a single AnnotatedDocument object with them.
    while cur_snippet_idx < len(snippets) and cur_ann_idx < len(annotations):
        cur_substring_idx = 0

        token_ann = []

        cur_snippet = snippets[cur_snippet_idx]

        cur_annotation_text = annotations[cur_ann_idx].text
        cur_annotation_label = annotations[cur_ann_idx].label
        idx_found = cur_snippet.find(cur_annotation_text, cur_substring_idx)
        # Iterate the annotations for as long as we keep finding them in
        # the current snippet of text.
        while idx_found != -1:
            cur_annotation_offsets = (
                idx_found, idx_found + len(cur_annotation_text) - 1)
            token_ann.append(Annotation(
                cur_annotation_text,
                cur_annotation_label,
                cur_annotation_offsets))

            cur_substring_idx = idx_found + len(cur_annotation_text)
            cur_ann_idx += 1

            if cur_ann_idx < len(annotations):
                cur_annotation_text = annotations[cur_ann_idx].text
                cur_annotation_label = annotations[cur_ann_idx].label
                idx_found = cur_snippet.find(
                    cur_annotation_text, cur_substring_idx)
            else:
                break

        result_ann.append(AnnotatedDocument(
            cur_snippet.encode(annotated_document.encoding),
            token_ann))
        cur_snippet_idx += 1

    return result_ann
Example 21
def transform_bio_tags_to_annotated_document(tokens, bio_tags, document):
    """ Given a list of tokens, a list of BIO tags, and a document object,
        this function returns an annotated document formed from them.

        Example:
            doc -> "Barack Obama lives in the White House"
            tokens ->
            [['Barack', 'Obama', 'lives', 'in', 'the', 'White', 'House']]
            bio ->
            [['B_Person', 'I_Person', 'O', 'O', 'O', 'B_Institution',
            'I_Institution']]

        It returns:
        AnnotatedDocument(
        content = "Barack Obama lives in the White House"
        annotations = (
            (Barack Obama, Person, (0, 11))
            (White House, Institution, (26, 36))
            )
        )
    """
    content = document.plain_text_

    cur_token_idx = 0
    cur_substring_idx = 0

    annotations = []
    while cur_token_idx < len(bio_tags):
        cur_token = tokens[cur_token_idx]
        cur_tag = bio_tags[cur_token_idx]

        if not cur_tag.startswith("B"):
            cur_substring_idx += len(cur_token)
            cur_token_idx += 1
            continue

        cur_label = cur_tag.split("_")[1]

        # Get the absolute start of the entity, given the index
        # which stores information about the previously detected
        # entity offset.
        start_idx = content.find(cur_token, cur_substring_idx)
        end_idx = start_idx + len(cur_token)

        if cur_token_idx + 1 < len(bio_tags):
            next_tag = bio_tags[cur_token_idx + 1]
            # If the next tag continues the entity (I_*), extend the
            # annotation span across the remaining tokens.
            if next_tag.startswith("I"):
                while next_tag.startswith("I"):
                    cur_token_idx += 1
                    cur_token = tokens[cur_token_idx]
                    try:
                        next_tag = bio_tags[cur_token_idx + 1]
                    except IndexError:
                        break

                tmp_idx = content.find(cur_token, cur_substring_idx)
                # This line overwrites end_idx, in case there is a
                # multi-term annotation.
                end_idx = tmp_idx + len(cur_token)

        # Ends at the last character, and not after!
        idx_tuple = (start_idx, end_idx - 1)
        cur_substring_idx = end_idx

        annotations.append(Annotation(
            content[start_idx:end_idx],
            cur_label,
            idx_tuple))

        cur_token_idx += 1

    return AnnotatedDocument(
        document.content, annotations=annotations, encoding=document.encoding)
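
A quick usage sketch mirroring the docstring example (assuming a Document built from the same sentence):

doc = Document(b"Barack Obama lives in the White House")
tokens = ["Barack", "Obama", "lives", "in", "the", "White", "House"]
tags = ["B_Person", "I_Person", "O", "O", "O",
        "B_Institution", "I_Institution"]

annotated = transform_bio_tags_to_annotated_document(tokens, tags, doc)
# annotated.annotations -> [("Barack Obama", "Person", (0, 11)),
#                           ("White House", "Institution", (26, 36))]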
Example 22
def test_to_inline_string():
    annotation = Annotation("pizza", "FOOD", (1, 6))
    assert_equal(annotation.to_inline_string(), "FOOD[pizza]")
Example 23
def test_str():
    annotation = Annotation("pizza", "FOOD", (1, 6))
    assert_equal(str(annotation), "1,6 FOOD[pizza]")
Example 24
def test_hash():
    annotation_1 = Annotation("pizza", "FOOD", (1, 6))
    annotation_2 = Annotation("pizza", "FOOD", (1, 6))

    assert_equal(hash(annotation_1), hash(annotation_2))
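
These three behaviors (Examples 22-24) follow from treating (text, label, offset) as the annotation's identity. Continuing the illustrative Annotation class from the comparison example, the corresponding methods might look like this:

    def __hash__(self):
        # Equal annotations (same text, label, offset) hash identically.
        return hash((self.text, self.label, self.offset))

    def to_inline_string(self):
        return "{}[{}]".format(self.label, self.text)

    def __str__(self):
        start, end = self.offset
        return "{},{} {}".format(start, end, self.to_inline_string())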
Example 25
def test_split_annotated_document():
    content = (b"The quick brown fox jumps over the lazy dog. "
               b"Grumpy wizards make a toxic brew for the jovial queen.")
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42)),
        Annotation("wizards", "PERSON", (52, 58)),
        Annotation("brew", "DRINK", (73, 76)),
        Annotation("queen", "PERSON", (93, 97))
    ]
    annotated_document = AnnotatedDocument(content, annotations)
    result = split_annotated_document(annotated_document)

    assert_equal(result[0].content,
                 b"The quick brown fox jumps over the lazy dog.")
    assert_equal(result[1].content,
                 b"Grumpy wizards make a toxic brew for the jovial queen.")

    expected_annotations_doc1 = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    assert_equal(result[0].annotations, expected_annotations_doc1)

    expected_annotations_doc2 = [
        Annotation("wizards", "PERSON", (7, 13)),
        Annotation("brew", "DRINK", (28, 31)),
        Annotation("queen", "PERSON", (48, 52))
    ]
    assert_equal(result[1].annotations, expected_annotations_doc2)