Exemple #1
0
def json_data(data):
    """Build a Document from an already-decoded JSON mapping.

    Keys read from ``data`` (all optional, with the defaults shown):

    * ``"name"`` -- document name, defaults to ``"_DOCUMENT_"``.
    * ``"content"`` -- document text, defaults to the empty string.
    * ``"metadatas"`` -- mapping of metadata key to value.
    * ``"segmentations"`` -- mapping of segmentation name to a mapping
      holding ``"spans"`` (a list of ``{"s": start, "l": length}``) and an
      optional ``"reference"`` (name of another segmentation).
    * ``"annotations"`` -- mapping of annotation name to a mapping holding
      ``"reference"`` (name of a segmentation) and ``"annotations"`` (a list
      of ``{"s": start, "l": length, "v": value}``).

    Returns:
        The populated Document.

    Raises:
        KeyError: if a segmentation or annotation entry lacks one of its
            mandatory keys.
    """
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)

    for segmentation_name, seg_data in data.get(u"segmentations", {}).items():
        # ub is passed as 0 next to an explicit length; presumably Span
        # derives the upper bound from lb + length -- TODO confirm.
        spans = [
            Span(lb=span[u"s"], ub=0, length=span[u"l"])
            for span in seg_data[u"spans"]
        ]
        segmentation = Segmentation(segmentation_name,
                                    spans=spans,
                                    reference=seg_data.get(u"reference", None))
        document.add_segmentation(segmentation)

    # References were stored as names above; now that every segmentation is
    # registered, swap each name for the segmentation it designates.
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(
                segmentation.reference)

    for annotation_name, annot_data in data.get(u"annotations", {}).items():
        tags = [
            Tag(lb=item[u"s"],
                ub=0,
                length=item[u"l"],
                value=item[u"v"]) for item in annot_data[u"annotations"]
        ]
        annotation = Annotation(annotation_name,
                                reference=document.segmentation(
                                    annot_data[u"reference"]),
                                annotations=tags)
        document.add_annotation(annotation)

    # The original built the document but fell off the end of the function,
    # so callers always received None.
    return document
Exemple #2
0
def brat_file(filename, encoding="utf-8"):
    """Load a BRAT document from its ``.txt`` / ``.ann`` file pair.

    Args:
        filename: path to either file of the pair (or the bare stem); the
            extension is dropped and ``.txt`` / ``.ann`` are appended.
        encoding: encoding used to decode both files.

    Returns:
        A Document whose content is the text file with carriage returns
        removed, carrying one sorted ``"NER"`` annotation built from the
        ``T`` lines of the ``.ann`` file.

    Raises:
        ValueError: if the ``.txt`` or the ``.ann`` file does not exist.
    """
    no_ext = os.path.splitext(filename)[0]
    txt_file = no_ext + ".txt"
    ann_file = no_ext + ".ann"
    if not (os.path.exists(txt_file) and os.path.exists(ann_file)):
        raise ValueError("missing either .ann or .txt file")

    document = Document(os.path.basename(txt_file),
                        encoding=encoding,
                        mime_type="text/plain")
    # Plain "r" instead of "rU": the "U" flag is rejected by open() on
    # Python 3.11+, and carriage returns are stripped explicitly anyway.
    # The context managers make sure both handles get closed.
    with codecs.open(txt_file, "r", encoding) as txt_stream:
        document.content = txt_stream.read().replace(u"\r", u"")

    annotations = Annotation("NER")
    with codecs.open(ann_file, "r", encoding) as ann_stream:
        for line in ann_stream:
            line = line.strip()
            # Only lines starting with "T" are read; an empty line fails
            # this check too, so no separate emptiness test is needed.
            if not line.startswith(u"T"):
                continue
            parts = line.split(u"\t")
            value, bounds = parts[1].split(" ", 1)
            # ";" separates the fragments of one entry; each fragment
            # becomes its own Tag with the same value.
            for bound in bounds.split(";"):
                lb, ub = bound.split()
                annotations.append(Tag(lb=int(lb), ub=int(ub), value=value))
    annotations.sort()
    document.add_annotation(annotations)

    return document
def main(indirnames, outfilename, default_shift=0, top_level=False):
    """Merge the data of several directories into one BRAT file pair.

    Every immediate sub-directory of each entry in ``indirnames`` (visited
    in sorted order) is handed to ``make_data``. The collected contents are
    joined with ``NUM_NEWLINES`` newlines and written to
    ``outfilename + ".txt"``; the collected annotations are exported to
    ``outfilename + ".ann"``.

    Args:
        indirnames: iterable of directory paths to scan.
        outfilename: output path without extension.
        default_shift: shift passed to ``make_data`` for the first
            sub-directory; each later call receives the shift returned by
            the previous one.
        top_level: forwarded unchanged to ``make_data``.
    """
    dirs = []
    for indirname in indirnames:
        dirs.extend(
            os.path.join(indirname, name)
            for name in sorted(os.listdir(indirname))
            if os.path.isdir(os.path.join(indirname, name))
        )

    contents = []
    annotations = []
    # Start from the caller-supplied shift: the original hard-coded 0 here,
    # which silently ignored the default_shift argument.
    shift = default_shift
    for dirname in dirs:
        cur_contents, cur_annotations, shift = make_data(
            dirname, default_shift=shift, top_level=top_level)
        contents.extend(cur_contents)
        annotations.extend(cur_annotations)

    document = Document("_doc_", content=(u"\n" * NUM_NEWLINES).join(contents))
    document.add_annotation(Annotation("NER", annotations=annotations))
    exporter = BratExporter()
    with codecs.open(outfilename + ".ann", "w", "utf-8") as output_stream:
        output_stream.write(
            exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(outfilename + ".txt", "w", "utf-8") as output_stream:
        output_stream.write(document.content)
Exemple #4
0
def main(indirname, outfilename, default_shift=0, top_level=False):
    """Convert the data of one directory into a BRAT ``.ann`` / ``.txt`` pair.

    ``make_data`` supplies the text pieces and the annotations for
    ``indirname``. The pieces are joined with ``NUM_NEWLINES`` newlines to
    form the document content, which is written to ``outfilename + ".txt"``;
    the annotations are exported to ``outfilename + ".ann"``.

    Args:
        indirname: directory handed to ``make_data``.
        outfilename: output path without extension.
        default_shift: forwarded unchanged to ``make_data``.
        top_level: forwarded unchanged to ``make_data``.
    """
    # make_data returns a triple; its third item (the resulting shift) is
    # bound but not used here.
    made = make_data(indirname,
                     default_shift=default_shift,
                     top_level=top_level)
    contents, annotations, shift = made

    separator = u"\n" * NUM_NEWLINES
    document = Document("_doc_", content=separator.join(contents))
    ner_annotation = Annotation("NER", annotations=annotations)
    document.add_annotation(ner_annotation)

    exporter = BratExporter()
    ann_path = outfilename + ".ann"
    txt_path = outfilename + ".txt"
    with codecs.open(ann_path, "w", "utf-8") as ann_stream:
        exported = exporter.document_to_unicode(document, {"ner": "NER"})
        ann_stream.write(exported)
    with codecs.open(txt_path, "w", "utf-8") as txt_stream:
        txt_stream.write(document.content)
Exemple #5
0
def gate_data(data, name=None):
    """Build a Document from a parsed GATE XML tree.

    Args:
        data: element exposing ``findall`` and containing one
            ``TextWithNodes`` child and any number of ``AnnotationSet``
            children (presumably an ``xml.etree.ElementTree`` element --
            verify against callers).
        name: document name; ``"__DOCUMENT__"`` is used when falsy.

    Returns:
        A Document whose content is the concatenated text of
        ``TextWithNodes`` and which holds one Annotation per
        ``AnnotationSet``, named after the set's ``Name`` attribute.

    Raises:
        IndexError: if ``data`` has no ``TextWithNodes`` child.
        KeyError: if a required attribute is absent, or an annotation
            refers to a node id that ``TextWithNodes`` does not contain.
    """
    document = Document(name or "__DOCUMENT__", mime_type="text/plain")

    textwithnodes = data.findall("TextWithNodes")[0]
    annotation_sets = data.findall("AnnotationSet")

    # Each child node marks a position in the text: its offset is the length
    # of everything that precedes it. A running total keeps this linear
    # instead of re-summing all previous parts for every node.
    leading_text = textwithnodes.text or u""
    text_parts = [leading_text]
    offset = len(leading_text)
    nodes = {}
    for node in textwithnodes:
        nodes[int(node.attrib["id"])] = offset
        tail = node.tail or u""
        text_parts.append(tail)
        offset += len(tail)
    document.content = u"".join(text_parts)

    for annotation_set in annotation_sets:
        # NOTE(review): a set without a "Name" attribute raises KeyError
        # here -- confirm whether unnamed sets can occur in the input.
        annotation_name = annotation_set.attrib["Name"]
        sem_annotation = Annotation(annotation_name)
        for annotation in annotation_set:
            lb = nodes[int(annotation.attrib["StartNode"])]
            ub = nodes[int(annotation.attrib["EndNode"])]
            sem_annotation.append(Tag(lb, ub, annotation.attrib["Type"]))
        document.add_annotation(sem_annotation)

    return document