Example No. 1
def test_optimizer():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # Test it with CRF because it's the least time-consuming one to train.
    crf = CRF()
    hparams = {
        "c1": ExactListParam([0.1, 0.9]),
        "c2": ExactListParam([0.1, 0.9])
    }
    optimizer = Optimizer(crf, hparams, "COLOR", cv=3)
    best_estimator, f1score = optimizer.optimize_and_return_best([
        annotated_document_1, annotated_document_2, annotated_document_3
    ])

    assert_greater_equal(f1score, 0.5)
Example No. 2
def test_transform_annotated_documents_to_bio_format():

    # Test no annotations.
    content = b"The quick brown fox jumps over the lazy dog."
    annotated_document = AnnotatedDocument(content, annotations=None)

    expected = ([[
        'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog',
        '.'
    ]], [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])

    transformed = transform_annotated_documents_to_bio_format(
        [annotated_document])
    assert_equal(transformed, expected)

    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document = AnnotatedDocument(content, annotations=annotations)
    expected = ([[
        'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog',
        '.'
    ]], [[
        'O', 'O', 'B_COLOR', 'B_ANIMAL', 'O', 'O', 'O', 'O', 'B_ANIMAL', 'O'
    ]])

    transformed = transform_annotated_documents_to_bio_format(
        [annotated_document])
    assert_equal(transformed, expected)
Example No. 3
def test_kfold_cv():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # Test it with CRF because it's the least time-consuming one to train.
    crf = CRF()
    kfold = KFoldCV(crf, k=3)
    average_f1 = kfold.cross_validate(
        [annotated_document_1, annotated_document_2, annotated_document_3],
        {"max_iterations": 100})

    # The examples and k are selected in a way where this always happens.
    assert_equal(average_f1, 0.5)
Example No. 4
def test_split_annotated_document():
    content = (b"The quick brown fox jumps over the lazy dog. "
               b"Grumpy wizards make a toxic brew for the jovial queen.")
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42)),
        Annotation("wizards", "PERSON", (52, 58)),
        Annotation("brew", "DRINK", (73, 76)),
        Annotation("queen", "PERSON", (93, 97))
    ]
    annotated_document = AnnotatedDocument(content, annotations)
    result = split_annotated_document(annotated_document)

    assert_equal(result[0].content,
                 b"The quick brown fox jumps over the lazy dog.")
    assert_equal(result[1].content,
                 b"Grumpy wizards make a toxic brew for the jovial queen.")

    expected_annotations_doc1 = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    assert_equal(result[0].annotations, expected_annotations_doc1)

    expected_annotations_doc2 = [
        Annotation("wizards", "PERSON", (7, 13)),
        Annotation("brew", "DRINK", (28, 31)),
        Annotation("queen", "PERSON", (48, 52))
    ]
    assert_equal(result[1].annotations, expected_annotations_doc2)
Example No. 5
def test_crf_multi_term():
    content = b"The dark brown fox jumps over the lazy dog magnificently."
    annotations = [
        Annotation("dark brown", "COLOR", (4, 13)),
        Annotation("fox", "ANIMAL", (15, 17)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    # Train.
    mod = CRF()
    mod.fit([annotated_document])

    # Predict. Works!
    content = b"The dark brown fox."
    document = Document(content)

    ann = mod.transform([document])

    # 2 annotations, dark brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "dark brown")
    assert_equal(ann[0].annotations[0].offset, (4, 13))
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].offset, (15, 17))
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
Example No. 6
def test_crf():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    # Train.
    mod = CRF()
    mod.fit([annotated_document])

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)

    ann = mod.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))
Example No. 7
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.

            In a dictionary-based approach, a dictionary of keywords is used
            to create an FSA, which is then used to search the text. See [1].
            [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
        """
        annotated_documents = []
        for document in X:
            annotations = []
            doc_content_str = document.plain_text_
            for item in self.automaton.iter(doc_content_str):
                end_position, (index, word) = item

                start_position = (end_position - len(word) + 1)
                end_position = end_position + 1

                annotations.append(
                    Annotation(word, self.entity_label,
                               (start_position, end_position)))

            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))

        return annotated_documents
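The excerpt above assumes that `self.automaton` was already built from the dictionary during fitting. Below is a minimal sketch of how such an automaton could be constructed with the pyahocorasick package; `build_automaton` and the `keywords` parameter are hypothetical names, not the library's API, but the stored `(index, word)` value matches the unpacking in the loop above.

import ahocorasick

def build_automaton(keywords):
    # Build an Aho-Corasick automaton over the dictionary entries. After
    # make_automaton(), iter(text) yields (end_index, (index, word)) tuples,
    # where end_index points at the last matched character.
    automaton = ahocorasick.Automaton()
    for index, word in enumerate(keywords):
        automaton.add_word(word, (index, word))
    automaton.make_automaton()
    return automaton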
Example No. 8
    def transform(self, X=None, y=None):
        """ Transforms the available documents into the appropriate objects,
            depending on the `annotated` parameter.
        """

        # If not annotated, fall back to base class and simply read files
        if not self.annotated:
            return super().transform(X, y)
        # Else, read txt/ann
        else:
            annotated_docs = []
            for found in os.listdir(self.path):
                f = join(self.path, found)

                if f.endswith(".txt") and isfile(f):
                    # Standard brat folder structure:
                    # For every txt there should be an ann.
                    brat_f = f.replace(".txt", ".ann")
                    annotations = self._read_brat_ann_file(brat_f)

                    with open(f, 'rb') as doc_file:
                        annotated_docs.append(AnnotatedDocument(
                            doc_file.read(),
                            annotations,
                            self.encoding))

            return annotated_docs
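The reader above delegates the parsing of .ann files to `_read_brat_ann_file`, which is not shown in this excerpt. A simplified standalone sketch of such a parser follows; it is a hypothetical helper, not the library's implementation, and it assumes the inclusive-end offsets used by the Annotation objects in these examples while ignoring discontinuous spans and non-entity lines.

def read_brat_ann_file(ann_path):
    # Parse brat standoff entity lines of the form:
    # T1<TAB>COLOR 10 15<TAB>brown
    annotations = []
    with open(ann_path, "r") as ann_file:
        for line in ann_file:
            if not line.startswith("T"):
                continue  # skip relations, events, notes, etc.
            _, type_and_span, text = line.rstrip("\n").split("\t")
            label, start, end = type_and_span.split()[:3]
            # brat end offsets are exclusive; subtract 1 for inclusive ends.
            annotations.append(
                Annotation(text, label, (int(start), int(end) - 1)))
    return annotations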
Example No. 9
def test_spacy_multi_term():
    content = b"The quick dark brown fox jumps over the lazy dog."
    annotations = [
        Annotation("dark brown", "COLOR", (10, 19)),
        Annotation("fox", "ANIMAL", (21, 23)),
        Annotation("dog", "ANIMAL", (45, 47))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A dark brown fox jumps quickly."
    annotations = [
        Annotation("dark brown", "COLOR", (2, 11)),
        Annotation("fox", "ANIMAL", (13, 15))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was dark brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("dark brown", "COLOR", (17, 26)),
        Annotation("dog", "ANIMAL", (41, 43))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    data = [annotated_document_1, annotated_document_2, annotated_document_3]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict. Works!
    content = b"The dark brown fox."
    document = Document(content)

    ann = model.transform([document])

    # 2 annotations, dark brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "dark brown")
    assert_equal(ann[0].annotations[0].offset, (4, 13))
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].offset, (15, 17))
    assert_equal(ann[0].annotations[1].label, "ANIMAL")

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
Example No. 10
def test_spacy():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))
    ]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))
    ]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    data = [annotated_document_1, annotated_document_2, annotated_document_3]

    # Train.
    model = SpaCyStatisticalNER()
    model.fit(data, num_epochs=5)

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)

    ann = model.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))

    entities = model.extract([document])
    assert_equal(len(entities[0]), 2)
Example No. 11
def test_ner_ensemble_configuration():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))]
    annotated_document_1 = AnnotatedDocument(content, annotations=annotations)

    content = b"A brown fox jumps quickly."
    annotations = [
        Annotation("brown", "COLOR", (2, 6)),
        Annotation("fox", "ANIMAL", (8, 10))]
    annotated_document_2 = AnnotatedDocument(content, annotations=annotations)

    content = b"The fox that was brown jumps over a dog that was lazy."
    annotations = [
        Annotation("fox", "ANIMAL", (4, 6)),
        Annotation("brown", "COLOR", (17, 21)),
        Annotation("dog", "ANIMAL", (36, 38))]
    annotated_document_3 = AnnotatedDocument(content, annotations=annotations)

    # In that config file we have params for a CRF with c1 = 0.1 and c2 = 0.1.
    ner_ensemble_config = NERModelEnsembleConfiguration(
        "nerds/test/data/config/sample.yaml")
    ner_ensemble_config.fit([
        annotated_document_1,
        annotated_document_2,
        annotated_document_3])

    # Predict. Works!
    content = b"The quick brown fox."
    document = Document(content)

    ann = ner_ensemble_config.transform([document])

    # 2 annotations, brown and fox
    assert_equal(len(ann[0].annotations), 2)
    assert_equal(ann[0].annotations[0].text, "brown")
    assert_equal(ann[0].annotations[0].label, "COLOR")
    assert_equal(ann[0].annotations[0].offset, (10, 14))
    assert_equal(ann[0].annotations[1].text, "fox")
    assert_equal(ann[0].annotations[1].label, "ANIMAL")
    assert_equal(ann[0].annotations[1].offset, (16, 18))
Example No. 12
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.

            The basic implementation of this method does not annotate any
            entities and should be overridden by subclasses.
        """
        annotated_documents = []
        for document in X:
            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  encoding=document.encoding))
        return annotated_documents
Example No. 13
def test_transform_to_spacy_format():
    content = b"The quick brown fox jumps over the lazy dog."
    annotations = [
        Annotation("brown", "COLOR", (10, 14)),
        Annotation("fox", "ANIMAL", (16, 18)),
        Annotation("dog", "ANIMAL", (40, 42))
    ]
    annotated_document = AnnotatedDocument(content, annotations=annotations)

    expected = [("The quick brown fox jumps over the lazy dog.", {
        "entities": [(10, 15, "COLOR"), (16, 19, "ANIMAL"), (40, 43, "ANIMAL")]
    })]

    model = SpaCyStatisticalNER()
    transformed = model._transform_to_spacy_format([annotated_document])

    assert_equal(transformed, expected)
Example No. 14
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.

            Needs an implementation of the `vote` method.
        """
        annotated_entities_per_model = []
        for model in self.models:
            annotated_entities_per_model.append(model.extract(X, y))

        annotated_documents = []
        for doc_idx, document in enumerate(X):
            entity_matrix = np.array(
                annotated_entities_per_model)[:, doc_idx].tolist()
            annotated_documents.append(
                AnnotatedDocument(document.content, self.vote(entity_matrix),
                                  document.encoding))
        return annotated_documents
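The docstring notes that a concrete `vote` implementation must be supplied. A minimal majority-vote sketch is shown below; it is hypothetical, not the library's implementation, and simply keeps every annotation proposed by more than half of the models for a given document.

    def vote(self, entity_matrix):
        # entity_matrix holds one list of Annotation objects per model, all
        # referring to the same document.
        counts = {}
        for model_annotations in entity_matrix:
            for annotation in model_annotations:
                key = (annotation.text, annotation.label, annotation.offset)
                counts[key] = counts.get(key, 0) + 1
        majority = len(entity_matrix) / 2
        return [Annotation(text, label, offset)
                for (text, label, offset), count in counts.items()
                if count > majority]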
Example No. 15
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.
        """
        annotated_documents = []
        for document in X:
            annotated_document = self.nlp(document.plain_text_)
            annotations = []
            for named_entity in annotated_document.ents:
                annotations.append(
                    Annotation(
                        named_entity.text, named_entity.label_,
                        (named_entity.start_char, named_entity.end_char - 1)))
            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))
        return annotated_documents
Example No. 16
def test_ExactMatchMultiClassDictionaryNER2():
    documents = [
        AnnotatedDocument(b"""
        In this study , we have used the polymerase chain reaction ( PCR ) with nested 
        primers to analyze X-inactivation patterns of the HUMARA loci in purified eosinophils 
        from female patients with eosinophilia .
        """, annotations= [
            Annotation("HUMARA loci", "DNA", (139, 150)),
            Annotation("purified eosinophils", "cell-type", (154, 174))
        ])]
    ner = ExactMatchMultiClassDictionaryNER(
        "nerds/test/data/dictionary/biodictionary.txt")
    ner.fit(documents)
    pred_documents = ner.transform(documents)
    # Expected entity labels, assumed to mirror the gold annotations above.
    expected_labels = ["DNA", "cell-type"]
    for i, annotation in enumerate(pred_documents[0].annotations):
        pred_text = annotation.text
        pred_offsets = annotation.offset
        label_text = documents[0].plain_text_[pred_offsets[0]:pred_offsets[1]]
        assert_equal(pred_text, label_text, 
            "predicted {:s} != label {:s}".format(pred_text, label_text))
        assert_equal(annotation.label, expected_labels[i])
Example No. 17
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.
        """
        annotated_documents = []
        for document in X:
            content = sentence_to_tokens(document.plain_text_)
            output = self.model.analyze(content)
            substring_index = 0
            annotations = []
            for entity in output["entities"]:
                start_idx, end_idx = _get_offsets_with_fuzzy_matching(
                    document.plain_text_, entity["text"], substring_index)
                offset = (start_idx, end_idx - 1)
                annotations.append(
                    Annotation(document.plain_text_[start_idx:end_idx],
                               self._label_map[entity["type"]], offset))
                substring_index = end_idx
            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))
        return annotated_documents
Example No. 18
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.

            In a dictionary-based approach, a dictionary of keywords is used
            to create an FSA, which is then used to search the text. See [1].
            [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
        """
        annotated_documents = []
        for document in X:
            annotations = []
            doc_content_str = document.plain_text_
            for item in self.automaton.iter(doc_content_str):
                end_position, (label, word) = item

                start_position = (end_position - len(word) + 1)
                end_position = end_position + 1

                # Aho-Corasick matches partial strings in the input document, which
                # leads to spurious matches, so we check to see that the match spans
                # a full word before adding it to our list of valid annotations
                if ((start_position <= 0
                     and doc_content_str[end_position] == " ")
                        or (end_position >= len(doc_content_str)
                            and doc_content_str[start_position - 1] == " ")
                        or (doc_content_str[start_position - 1] == " "
                            and doc_content_str[end_position] == " ")):
                    annotations.append(
                        Annotation(word, label,
                                   (start_position, end_position)))

            annotated_documents.append(
                AnnotatedDocument(document.content,
                                  annotations=annotations,
                                  encoding=document.encoding))

        return annotated_documents
Example No. 19
def split_annotated_document(
        annotated_document, splitter=document_to_sentences):
    """ Splits an annotated document and maintains the annotation offsets.

        This function accepts an AnnotatedDocument object as parameter along
        with an optional tokenization method. It splits the document according
        to the tokenization method, and returns a list of AnnotatedDocument
        objects, where the annotation offsets have been adjusted.

        Args:
            annotated_document (AnnotatedDocument): The document that will be
                split into more documents.
            splitter: (func, optional): A function that accepts a string as
                input and returns a list of strings. Defaults to
                `document_to_sentences`, which is the default sentence splitter
                for this library.

        Returns:
            list(AnnotatedDocument): A list of annotated documents.
    """

    snippets = [
        snippet.strip() for snippet in
        splitter(annotated_document.plain_text_)]
    annotations = annotated_document.annotations

    cur_snippet_idx = 0
    cur_ann_idx = 0

    result_ann = []

    # Iterate every snippet of text and isolate its annotations.
    # Then construct a single AnnotatedDocument object with them.
    while cur_snippet_idx < len(snippets) and cur_ann_idx < len(annotations):
        cur_substring_idx = 0

        token_ann = []

        cur_snippet = snippets[cur_snippet_idx]

        cur_annotation_text = annotations[cur_ann_idx].text
        cur_annotation_label = annotations[cur_ann_idx].label
        idx_found = cur_snippet.find(cur_annotation_text, cur_substring_idx)
        # Iterate the annotations for as long as we keep finding them in
        # the current snippet of text.
        while idx_found != -1:
            cur_annotation_offsets = (
                idx_found, idx_found + len(cur_annotation_text) - 1)
            token_ann.append(Annotation(
                cur_annotation_text,
                cur_annotation_label,
                cur_annotation_offsets))

            cur_substring_idx = idx_found + len(cur_annotation_text)
            cur_ann_idx += 1

            if cur_ann_idx < len(annotations):
                cur_annotation_text = annotations[cur_ann_idx].text
                cur_annotation_label = annotations[cur_ann_idx].label
                idx_found = cur_snippet.find(
                    cur_annotation_text, cur_substring_idx)
            else:
                break

        result_ann.append(AnnotatedDocument(
            cur_snippet.encode(annotated_document.encoding),
            token_ann))
        cur_snippet_idx += 1

    return result_ann
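Example No. 4 above exercises the default sentence splitter. As a usage sketch, any callable that takes a string and returns a list of strings can be passed as `splitter`; the lambda below (splitting on blank lines) is purely illustrative.

paragraph_docs = split_annotated_document(
    annotated_document,
    splitter=lambda text: text.split("\n\n"))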
Example No. 20
def transform_bio_tags_to_annotated_document(tokens, bio_tags, document):
    """ Given a list of tokens, a list of BIO tags, and a document object,
        this function returns annotated documents formed from this information.

        Example:
            doc -> "Barack Obama lives in the White House"
            tokens ->
            ['Barack', 'Obama', 'lives', 'in', 'the', 'White', 'House']
            bio ->
            ['B_Person', 'I_Person', 'O', 'O', 'O', 'B_Institution',
            'I_Institution']

        It returns:
        AnnotatedDocument(
        content = "Barack Obama lives in the White House"
        annotations = (
            (Barack Obama, Person, (0, 11))
            (White House, Institution, (26, 36))
            )
        )
    """
    content = document.plain_text_

    cur_token_idx = 0
    cur_substring_idx = 0

    annotations = []
    while cur_token_idx < len(bio_tags):
        cur_token = tokens[cur_token_idx]
        cur_tag = bio_tags[cur_token_idx]

        if not cur_tag.startswith("B"):
            cur_substring_idx += len(cur_token)
            cur_token_idx += 1
            continue

        cur_label = cur_tag.split("_")[1]

        # Get the absolute start of the entity, given the index
        # which stores information about the previously detected
        # entity offset.
        start_idx = content.find(cur_token, cur_substring_idx)
        end_idx = start_idx + len(cur_token)

        if cur_token_idx + 1 < len(bio_tags):
            next_tag = bio_tags[cur_token_idx + 1]
            # If the entity continues with I_ tags, advance to its last token.
            if next_tag.startswith("I"):
                while next_tag.startswith("I"):
                    cur_token_idx += 1
                    cur_token = tokens[cur_token_idx]
                    try:
                        next_tag = bio_tags[cur_token_idx + 1]
                    except IndexError:
                        break

                tmp_idx = content.find(cur_token, cur_substring_idx)
                # This line overwrites end_idx, in case there is a
                # multi-term annotation.
                end_idx = tmp_idx + len(cur_token)

        # Ends at the last character, and not after!
        idx_tuple = (start_idx, end_idx - 1)
        cur_substring_idx = end_idx

        annotations.append(Annotation(
            content[start_idx:end_idx],
            cur_label,
            idx_tuple))

        cur_token_idx += 1

    return AnnotatedDocument(
        document.content, annotations=annotations, encoding=document.encoding)
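As a usage sketch built from the data in Example No. 2 above, the function walks flat token and tag lists for a single document and should reconstruct the familiar annotations; the variable names below are illustrative only.

tokens = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy',
          'dog', '.']
bio_tags = ['O', 'O', 'B_COLOR', 'B_ANIMAL', 'O', 'O', 'O', 'O', 'B_ANIMAL',
            'O']
document = Document(b"The quick brown fox jumps over the lazy dog.")
annotated = transform_bio_tags_to_annotated_document(tokens, bio_tags, document)
# Expected annotations: ("brown", "COLOR", (10, 14)), ("fox", "ANIMAL", (16, 18)),
# ("dog", "ANIMAL", (40, 42)).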