def json_data(data):
    """Build a Document from a JSON-like dictionary and return it.

    Keys read from ``data``:

    - ``"name"``: document name (defaults to ``"_DOCUMENT_"``).
    - ``"content"``: document text (defaults to the empty string).
    - ``"metadatas"``: mapping of metadata key -> value (optional).
    - ``"segmentations"``: mapping of segmentation name -> dict holding a
      ``"spans"`` list (each span has ``"s"`` and ``"l"``) and an optional
      ``"reference"``.
    - ``"annotations"``: mapping of annotation name -> dict holding an
      ``"annotations"`` list (each item has ``"s"``, ``"l"`` and ``"v"``)
      and a mandatory ``"reference"`` naming a segmentation.

    Raises ``KeyError`` when a mandatory key listed above is missing.
    """
    document = Document(data.get(u"name", u"_DOCUMENT_"), content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)

    # First pass: register every segmentation, keeping "reference" exactly as
    # it was stored in the input.
    for segmentation_name, d in data.get(u"segmentations", {}).items():
        # NOTE(review): ub=0 together with length= presumably lets Span derive
        # its upper bound from the length -- confirm against Span.
        spans = [Span(lb=span[u"s"], ub=0, length=span[u"l"]) for span in d[u"spans"]]
        segmentation = Segmentation(segmentation_name, spans=spans, reference=d.get(u"reference", None))
        document.add_segmentation(segmentation)

    # Second pass: now that every segmentation is registered, replace each
    # stored reference by what document.segmentation() returns for it. This
    # must run after the loop above so that a reference can point to a
    # segmentation that was added later.
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(segmentation.reference)

    for annotation_name, d in data.get(u"annotations", {}).items():
        tags = [
            Tag(lb=item[u"s"], ub=0, length=item[u"l"], value=item[u"v"])
            for item in d[u"annotations"]
        ]
        annotation = Annotation(
            annotation_name,
            reference=document.segmentation(d[u"reference"]),
            annotations=tags,
        )
        document.add_annotation(annotation)

    # The document used to be built and then dropped (implicit None return).
    return document
def brat_file(filename, encoding="utf-8"):
    """Load a BRAT document (a .txt / .ann file pair) as a Document.

    ``filename`` may carry any extension (or none): the extension is stripped
    and both ``<base>.txt`` and ``<base>.ann`` are looked up.

    Only lines of the .ann file starting with ``T`` are read. Each of them
    yields one Tag per ``;``-separated "start end" fragment, all tags sharing
    the value found on the line. Tags are stored in an Annotation named
    ``"NER"``, sorted, and attached to the returned document.

    Raises ``ValueError`` if either the .txt or the .ann file is missing.
    """
    # Local import so the block does not depend on a file-level import.
    # io.open behaves the same on Python 2 and 3 and reads with universal
    # newlines by default, which replaces the "rU" mode flag (rejected by
    # Python 3.11+).
    import io

    no_ext = os.path.splitext(filename)[0]
    txt_file = no_ext + ".txt"
    ann_file = no_ext + ".ann"
    if not (os.path.exists(txt_file) and os.path.exists(ann_file)):
        raise ValueError("missing either .ann or .txt file")

    document = Document(os.path.basename(txt_file), encoding=encoding, mime_type="text/plain")
    with io.open(txt_file, "r", encoding=encoding) as txt_input:
        # Kept from the original: drop any carriage return still present.
        document.content = txt_input.read().replace(u"\r", u"")

    annotations = Annotation("NER")
    with io.open(ann_file, "r", encoding=encoding) as ann_input:
        for line in ann_input:
            line = line.strip()
            if not line.startswith(u"T"):
                continue
            # Tab-separated fields; the second one is "<value> <bounds>".
            parts = line.split(u"\t")
            value, bounds = parts[1].split(" ", 1)
            # Several ";"-separated fragments may be listed for one line.
            for bound in bounds.split(";"):
                lb, ub = bound.split()
                annotations.append(Tag(lb=int(lb), ub=int(ub), value=value))
    annotations.sort()
    document.add_annotation(annotations)

    return document
def main(indirnames, outfilename, default_shift=0, top_level=False):
    """Merge the data of several directories into one BRAT .ann/.txt pair.

    Every immediate sub-directory of each entry in ``indirnames`` (visited in
    sorted order) is handed to ``make_data``. The returned contents and
    annotations are accumulated, and the shift returned for one directory is
    passed as ``default_shift`` for the next one.

    The merged text is written to ``outfilename + ".txt"`` and the "NER"
    annotations to ``outfilename + ".ann"``, both encoded in UTF-8.

    ``default_shift`` is the shift given to ``make_data`` for the first
    directory; ``top_level`` is forwarded unchanged to every call.
    """
    dirs = []
    for indirname in indirnames:
        for name in sorted(os.listdir(indirname)):
            path = os.path.join(indirname, name)
            if os.path.isdir(path):
                dirs.append(path)

    contents = []
    annotations = []
    # Start from the caller-supplied shift: it used to be silently replaced
    # by 0, which made the default_shift parameter a no-op.
    shift = default_shift
    for dirname in dirs:
        cur_contents, cur_annotations, shift = make_data(
            dirname, default_shift=shift, top_level=top_level
        )
        contents.extend(cur_contents)
        annotations.extend(cur_annotations)

    document = Document("_doc_", content=(u"\n" * NUM_NEWLINES).join(contents))
    document.add_annotation(Annotation("NER", annotations=annotations))

    exporter = BratExporter()
    with codecs.open(outfilename + ".ann", "w", "utf-8") as O:
        O.write(exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(outfilename + ".txt", "w", "utf-8") as O:
        O.write(document.content)
def main(indirname, outfilename, default_shift=0, top_level=False):
    """Export the data of one directory as a BRAT .ann/.txt pair.

    ``make_data`` is called on ``indirname`` with ``default_shift`` and
    ``top_level`` forwarded as-is. Its contents are joined with
    ``NUM_NEWLINES`` newline characters to form the document text, and its
    annotations become the document's "NER" annotation. The third value it
    returns is not used here.

    The annotations go to ``outfilename + ".ann"`` and the text to
    ``outfilename + ".txt"``, both encoded in UTF-8.
    """
    texts, tags, _ = make_data(
        indirname,
        default_shift=default_shift,
        top_level=top_level,
    )

    separator = u"\n" * NUM_NEWLINES
    merged = Document("_doc_", content=separator.join(texts))
    ner = Annotation("NER", annotations=tags)
    merged.add_annotation(ner)

    brat = BratExporter()
    ann_path = outfilename + ".ann"
    txt_path = outfilename + ".txt"

    # The .ann file is opened before the export runs, and written first.
    with codecs.open(ann_path, "w", "utf-8") as ann_out:
        exported = brat.document_to_unicode(merged, {"ner": "NER"})
        ann_out.write(exported)

    with codecs.open(txt_path, "w", "utf-8") as txt_out:
        txt_out.write(merged.content)
def gate_data(data, name=None):
    """Build a Document from parsed GATE XML data and return it.

    ``data`` must provide ``findall`` (presumably an ElementTree element or
    tree -- confirm against callers). The first ``TextWithNodes`` element
    supplies the text: its leading text plus the tail of each child, in
    order. Each child's integer ``id`` attribute is mapped to the character
    offset at which that child occurs in the rebuilt text.

    Every ``AnnotationSet`` element becomes an Annotation named after its
    ``Name`` attribute, holding one Tag per child, built from the offsets of
    the child's ``StartNode`` / ``EndNode`` and its ``Type`` attribute.

    ``name`` is the document name; ``"__DOCUMENT__"`` is used when it is
    None or empty.

    Raises ``IndexError`` if there is no ``TextWithNodes`` element and
    ``KeyError`` if an expected attribute or node id is missing.
    """
    document = Document(name or "__DOCUMENT__", mime_type="text/plain")

    textwithnodes = data.findall("TextWithNodes")[0]
    annotation_sets = data.findall("AnnotationSet")

    text_parts = [textwithnodes.text or u""]
    nodes = {}
    # Running length of the text gathered so far. Keeping it up to date
    # avoids re-summing every part for each node, which was quadratic.
    offset = len(text_parts[0])
    for node in textwithnodes:
        nodes[int(node.attrib["id"])] = offset
        tail = node.tail or u""
        text_parts.append(tail)
        offset += len(tail)
    document.content = u"".join(text_parts)

    for annotation_set in annotation_sets:
        # NOTE(review): an AnnotationSet without a "Name" attribute raises
        # KeyError here -- confirm whether unnamed sets must be supported.
        annotation_name = annotation_set.attrib["Name"]
        sem_annotation = Annotation(annotation_name)
        for annotation in annotation_set:
            lb = nodes[int(annotation.attrib["StartNode"])]
            ub = nodes[int(annotation.attrib["EndNode"])]
            sem_annotation.append(Tag(lb, ub, annotation.attrib["Type"]))
        document.add_annotation(sem_annotation)

    return document