def json_data(data):
    """Build a Document from a JSON-like dict and return it.

    Expected (all keys optional) layout of ``data``:
      - ``name`` / ``content``: document name and raw text
      - ``metadatas``: flat mapping of metadata key/value pairs
      - ``segmentations``: name -> {"spans": [{"s": start, "l": length}, ...],
        "reference": optional name of another segmentation}
      - ``annotations``: name -> {"annotations": [{"s": ..., "l": ..., "v": value}, ...],
        "reference": name of the segmentation the offsets refer to}

    Fix over previous revision: the built ``Document`` is now returned
    (the function used to fall off the end and return ``None``), matching
    the behavior of the sibling ``conll_file`` importer.
    """
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)
    # First pass: create every segmentation, keeping its reference as a
    # plain name for now (the referenced segmentation may not exist yet).
    for segmentation_name, d in data.get(u"segmentations", {}).items():
        spans = [
            Span(lb=span[u"s"], ub=0, length=span[u"l"])
            for span in d[u"spans"]
        ]
        document.add_segmentation(
            Segmentation(segmentation_name,
                         spans=spans,
                         reference=d.get(u"reference", None)))
    # Second pass: resolve reference names to actual Segmentation objects.
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(
                segmentation.reference)
    for annotation_name, d in data.get(u"annotations", {}).items():
        annotations = [
            Tag(lb=tag[u"s"], ub=0, length=tag[u"l"], value=tag[u"v"])
            for tag in d[u"annotations"]
        ]
        document.add_annotation(
            Annotation(annotation_name,
                       reference=document.segmentation(d[u"reference"]),
                       annotations=annotations))
    return document
def conll_file(filename, fields, word_field, encoding="utf-8"):
    """Import a CoNLL-formatted file as a Document and return it.

    The file is parsed into a Corpus (one token per line, ``fields``
    naming the columns); the document text is rebuilt from the
    ``word_field`` column with one space between tokens and one newline
    between sentences. Two segmentations are attached:
      - ``"tokens"``: character spans of each token in the rebuilt text
      - ``"sentences"``: token-index spans, referencing ``"tokens"``
    """
    document = Document(os.path.basename(filename), encoding=encoding)
    document._corpus = Corpus.from_conll(filename, fields, encoding=encoding)

    char_offset = 0
    token_offset = 0
    sentence_texts = []
    token_spans = []
    sentence_spans = []
    for sentence in document._corpus.sentences:
        words = []
        for token in sentence:
            word = token[word_field]
            words.append(word)
            token_spans.append(Span(char_offset, char_offset + len(word)))
            # +1 accounts for the single separator that follows every
            # token in the rebuilt text (space, or newline at sentence end).
            char_offset += len(word) + 1
        sentence_texts.append(u" ".join(words))
        sentence_spans.append(
            Span(token_offset, token_offset + len(sentence)))
        token_offset += len(sentence)

    document._content = u"\n".join(sentence_texts)
    document.add_segmentation(Segmentation("tokens", spans=token_spans))
    document.add_segmentation(
        Segmentation("sentences",
                     reference=document.segmentation("tokens"),
                     spans=sentence_spans))
    return document