Esempio n. 1
0
def json_data(data):
    """Build a Document from an already-parsed JSON-like mapping.

    Keys read from ``data`` (every top-level key is optional):

    - ``u"name"``: document name, defaults to ``u"_DOCUMENT_"``.
    - ``u"content"``: document content, defaults to ``u""``.
    - ``u"metadatas"``: mapping whose key/value pairs are each passed to
      ``document.add_metadata``.
    - ``u"segmentations"``: mapping of segmentation name to a mapping with
      a ``u"spans"`` list (each item provides ``u"s"`` and ``u"l"``) and
      an optional ``u"reference"`` naming another segmentation.
    - ``u"annotations"``: mapping of annotation name to a mapping with a
      mandatory ``u"reference"`` (a segmentation name) and a
      ``u"annotations"`` list (each item provides ``u"s"``, ``u"l"`` and
      ``u"v"``).

    Mandatory keys are read by subscription, so a missing one propagates
    the lookup error (``KeyError`` for plain dicts).

    Returns:
        The populated Document.
    """
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)

    # First pass: create every segmentation, keeping "reference" exactly
    # as found in the input (a name, or None).
    for name, entry in data.get(u"segmentations", {}).items():
        spans = [
            Span(lb=span[u"s"], ub=0, length=span[u"l"])
            for span in entry[u"spans"]
        ]
        document.add_segmentation(
            Segmentation(name,
                         spans=spans,
                         reference=entry.get(u"reference", None)))

    # Second pass: resolve reference names only once every segmentation
    # has been added, so a reference may point to a segmentation that
    # appears later in the input mapping.
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(
                segmentation.reference)

    for name, entry in data.get(u"annotations", {}).items():
        tags = [
            Tag(lb=item[u"s"],
                ub=0,
                length=item[u"l"],
                value=item[u"v"])
            for item in entry[u"annotations"]
        ]
        document.add_annotation(
            Annotation(name,
                       reference=document.segmentation(entry[u"reference"]),
                       annotations=tags))

    # The original fell off the end here and implicitly returned None,
    # discarding the document that was just built.
    return document
Esempio n. 2
0
def conll_file(filename, fields, word_field, encoding="utf-8"):
    """Load a CoNLL file into a Document with token and sentence spans.

    The file is read through ``Corpus.from_conll`` and stored on the
    document as ``_corpus``. The document content is rebuilt from the
    ``word_field`` entry of every token: words of one sentence are joined
    with a single space, sentences with a single newline.

    Two segmentations are added:

    - ``"tokens"``: one span per token, expressed as character offsets
      into the rebuilt content.
    - ``"sentences"``: one span per sentence, expressed as token indices,
      with the ``"tokens"`` segmentation as reference.

    Args:
        filename: path of the file; its basename is the document name.
        fields: forwarded unchanged to ``Corpus.from_conll``.
        word_field: key used to read the word out of each token.
        encoding: forwarded to both ``Document`` and ``Corpus.from_conll``.

    Returns:
        The populated Document.
    """
    document = Document(os.path.basename(filename), encoding=encoding)
    document._corpus = Corpus.from_conll(filename, fields, encoding=encoding)

    words_per_sentence = []
    token_spans = []
    sentence_spans = []
    char_offset = 0
    token_offset = 0

    for sentence in document._corpus.sentences:
        words = [token[word_field] for token in sentence]
        words_per_sentence.append(words)

        for word in words:
            word_end = char_offset + len(word)
            token_spans.append(Span(char_offset, word_end))
            # Skip the one-character separator (space or newline) that
            # follows this word in the rebuilt content.
            char_offset = word_end + 1

        sentence_end = token_offset + len(sentence)
        sentence_spans.append(Span(token_offset, sentence_end))
        token_offset = sentence_end

    document._content = u"\n".join(
        [u" ".join(words) for words in words_per_sentence])

    document.add_segmentation(Segmentation("tokens", spans=token_spans))
    tokens = document.segmentation("tokens")
    document.add_segmentation(
        Segmentation("sentences", reference=tokens, spans=sentence_spans))
    return document