Example #1
0
def write_formatted(out, annotations_columns, annotations_structs, columns, structs, structs_count, text):
    """
    The 'formatted' XML part of the 'export' function: export xml with the same
    whitespace and indentation as in the original.

      - out is the path of the XML file to write (always written as UTF-8)
      - annotations_columns are the token annotations; the first one is the word
        itself and only decides where <w> elements open and close
      - annotations_structs is indexed by the second item of each struct
        attribute; element [0] of the indexed entry is read as an annotation
      - columns holds the attribute names of the token annotations
        (columns[i] names the i:th annotation; columns[0] is never written)
      - structs is a sequence of (element name, attrs) pairs, where each attr is
        (attribute name, index into annotations_structs)
      - structs_count is accepted for interface compatibility; it was only ever
        used as an offset that cancels out and does not influence the output
      - text is the corpus text file, read with util.corpus.read_corpus_text
    """

    def escape_text(s):
        # Escaping for character data between tags.
        return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    def escape_attr(s):
        # Escaping for double-quoted attribute values.
        return s.replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")

    txt, anchor2pos, pos2anchor = util.corpus.read_corpus_text(text)
    # Closing tags are emitted in this order: tokens first, then structs as listed.
    structs_order = ["__token__"] + [s[0] for s in structs]
    # anchor -> {"structs": {key: [(attr, value), ...]}, "token": [values], "close": {(elem, edge)}}
    anchors = defaultdict(dict)

    for elem, attrs in structs:
        for attr in attrs:
            struct = util.read_annotation(annotations_structs[attr[1]][0])
            for edge in struct:
                start, end = util.edgeStart(edge), util.edgeEnd(edge)
                opening = anchors[start].setdefault("structs", {})
                if start == end:
                    # Empty span: written as a self-closing element, marked by a 3-tuple key.
                    key = (elem, anchor2pos[end], "close")
                else:
                    key = (elem, anchor2pos[end])
                    anchors[end].setdefault("close", set()).add((elem, edge))
                opening.setdefault(key, []).append((attr[0], struct[edge]))

    for col, annot in enumerate(annotations_columns):
        is_word = col == 0
        for tok, value in util.read_annotation_iteritems(annot):
            if not is_word and value == "|/|":  # Any column except the first (the word)
                value = "|"
            anchors[util.edgeStart(tok)].setdefault("token", []).append(value.replace("\n", " "))
            if is_word:
                anchors[util.edgeEnd(tok)].setdefault("close", set()).add(("__token__", None))

    currpos = 0

    # Explicit encoding: the output has no XML declaration, so readers will
    # assume UTF-8 regardless of the locale this process runs under.
    with open(out, "w", encoding="utf-8") as OUT:
        OUT.write("<corpus>")
        for pos, anchor in sorted(pos2anchor.items(), key=lambda x: x[0]):
            # Copy the original text (including whitespace) up to this anchor.
            OUT.write(escape_text(txt[currpos:pos]))
            if anchor in anchors:
                here = anchors[anchor]

                if "close" in here:
                    closing = here["close"]
                    if ("__token__", None) in closing:
                        OUT.write("</w>")
                    OUT.write("".join("</%s>" % e[0]
                                      for e in sorted(closing, key=lambda x: structs_order.index(x[0]))
                                      if e[0] != "__token__"))

                if "structs" in here:
                    # Elements ending furthest away are opened first; ties are
                    # broken by reverse struct order so nesting mirrors the closing order.
                    opened = sorted(here["structs"].items(),
                                    key=lambda x: (-x[0][1], -structs_order.index(x[0][0])))
                    for elem, annot in opened:
                        attrstring = "".join(' %s="%s"' % (attr, escape_attr(val))
                                             for (attr, val) in annot if val and attr != UNDEF)
                        close = "/" if len(elem) == 3 else ""
                        OUT.write("<%s%s%s>" % (elem[0], attrstring, close))

                if "token" in here:
                    # The first value is the word itself, which is text content and not an attribute.
                    attrstring = "".join(' %s="%s"' % (columns[i], escape_attr(a))
                                         for i, a in enumerate(here["token"][1:], start=1) if a)
                    OUT.write("<w%s>" % attrstring)

            currpos = pos
        OUT.write("</corpus>")
    util.log.info("Exported: %s", out)
def align_texts(sentence1, sentence2, link1, link2, sent_parents1, sent_parents2, out_sentlink1, out_sentlink2):
    """Refine the sentence alignment between the current text (1) and a parallel reference text (2).

    - sentence1 and sentence2 are annotations listing the word IDs of each sentence
    - link1 and link2 are the existing annotations holding the link IDs of the two texts
    - sent_parents1 and sent_parents2 are annotations listing the sentences inside each existing link
    - out_sentlink1 and out_sentlink2 receive the resulting annotations for the new sentence links

    Every link ID found in link1 must also occur in link2 (a KeyError is raised otherwise).
    """

    def span(first, last):
        # Build a 'link' edge stretching from the start of `first` to the end of `last`.
        return util.mkEdge('link', [util.edgeStart(first), util.edgeEnd(last)])

    # Look up the text-2 link edge from its link ID.
    link2_by_id = {link_id: key for key, link_id in util.read_annotation(link2).items()}
    parents1 = util.read_annotation(sent_parents1)
    parents2 = util.read_annotation(sent_parents2)
    sents1 = util.read_annotation(sentence1)
    sents2 = util.read_annotation(sentence2)

    new_links1 = {}
    new_links2 = {}
    counter = 0

    for key1, link_id in util.read_annotation_iteritems(link1):
        key2 = link2_by_id[link_id]
        has_text1 = key1 in parents1
        has_text2 = key2 in parents2

        # Links that are empty on both sides produce no output.
        if not (has_text1 or has_text2):
            continue

        # Text on one side only: keep the old link as it is, under a new number.
        if not (has_text1 and has_text2):
            counter += 1
            new_links1[span(key1, key1)] = str(counter)
            new_links2[span(key2, key2)] = str(counter)
            continue

        # Text on both sides: split the link into smaller aligned units.
        group1 = [(sid, sents1[sid].split()) for sid in parents1[key1].split()]
        group2 = [(sid, sents2[sid].split()) for sid in parents2[key2].split()]

        for s1, s2 in gachalign(group1, group2, mean="gacha"):
            counter += 1
            label = str(counter)
            if s1:
                new_links1[span(s1[0], s1[-1])] = label
            if s2:
                new_links2[span(s2[0], s2[-1])] = label

    util.write_annotation(out_sentlink1, new_links1)
    util.write_annotation(out_sentlink2, new_links2)
Example #3
0
 def order(chunknr, edge, _value):
     """Return the sort key (chunk number, corpus position of the edge's start anchor)."""
     # The third argument is ignored; only the chunk and the edge decide the order.
     return (chunknr, anchors[chunknr][util.edgeStart(edge)])
Example #4
0
def run_wsd(wsdjar, sense_model, context_model, out, sentence, word, ref, lemgram, saldo, pos, text,
            sensefmt=util.SCORESEP + "%.3f", default_prob="-1", encoding=util.UTF8):
    """
    Runs the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation.
    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.
      - wsdjar is the name of the java programme to be used for the wsd
      - sense_model and context_model are the models to be used with wsdjar
      - out is the resulting annotation file
      - sentence is an existing annotation for sentences and their children (words)
      - word is an existing annotations for wordforms
      - ref is an existing annotation for word references
      - lemgram and saldo are existing annotations for inflection tables and meanings
      - pos is an existing annotations for part-of-speech
      - text is an existing file with the input text and its anchors.
      - sensefmt is a format string for how to print the sense and its probability
      - default_prob is the default value for unanalyzed senses
      - encoding is used to encode the input sent to the tool and to decode its output

    Nothing is written to out if the tool reports an error on stderr; the error is logged instead.
    The subprocess is always cleaned up, also when building the input or processing the output fails.
    """

    WORD = util.read_annotation(word)
    REF = util.read_annotation(ref)
    LEMGRAM = util.read_annotation(lemgram)
    SALDO = util.read_annotation(saldo)
    POS = util.read_annotation(pos)
    textpos = util.read_corpus_text(text)[1]

    # Sort sentences according to their text position because WSD is context dependent.
    sentences = sorted(util.read_annotation_iteritems(sentence), key=lambda x: textpos[util.edgeStart(x[0])])
    sentences = [sent.split() for _, sent in sentences]

    # Start WSD process
    process = wsd_start(wsdjar, sense_model, context_model, encoding)

    try:
        # Construct input and send to WSD
        stdin = build_input(sentences, WORD, REF, LEMGRAM, SALDO, POS)
        if encoding:
            stdin = stdin.encode(encoding)

        stdout, stderr = process.communicate(stdin)
        # TODO: Solve hack line below!
        # Problem is that regular messages "Reading sense vectors.." are also piped to stderr.
        if len(stderr) > 52:
            # Decode before logging so the message is readable instead of a b'...' repr.
            if encoding and isinstance(stderr, bytes):
                stderr = stderr.decode(encoding, errors="replace")
            util.log.error(str(stderr))
            return

        if encoding:
            stdout = stdout.decode(encoding)

        process_output(out, stdout, sentences, SALDO, sensefmt, default_prob)
    finally:
        # Kill running subprocess, also when one of the steps above raised.
        util.system.kill_process(process)
Example #5
0
def parse_swener_output(sentences, output, out_ne_ex, out_ne_type,
                        out_ne_subtype, out_ne_name):
    """Parse the SweNER output and write annotation files.

      - sentences is a sequence of sentences, each a sequence of token edges
      - output is the tagger output: one tagged sentence per SENT_SEP-separated
        chunk, with tokens separated by TOK_SEP
      - out_ne_ex, out_ne_type, out_ne_subtype and out_ne_name are the resulting
        annotations, holding each entity's tag name, its TYPE attribute, its SBT
        attribute and its text

    Sentences that cannot be parsed as XML, or whose token count runs past the
    end of the sentence, are skipped with a warning. Entities already collected
    from a sentence before such a count error are kept.
    """

    out_ex_dict = {}
    out_type_dict = {}
    out_subtype_dict = {}
    out_name_dict = {}

    # Loop through the NE-tagged sentences and parse each one with ElementTree
    for sent, tagged_sent in zip(sentences, output.strip().split(SENT_SEP)):
        xml_sent = "<sroot>" + tagged_sent + "</sroot>"

        # Filter out tags on the format <EnamexXxxXxx> since they seem to always overlap with <ENAMEX> elements,
        # making the XML invalid.
        xml_sent = re.sub(r'</?Enamex[^>\s]+>', '', xml_sent)
        try:
            root = etree.fromstring(xml_sent)
        except Exception:
            # Best-effort: any parser failure skips the sentence, but (unlike a bare
            # except) KeyboardInterrupt and SystemExit are allowed to propagate.
            util.log.warning("Error parsing sentence. Skipping.")
            continue

        # Init token counter; needed to get start_id and end_id
        i = 0
        previous_end = 0
        children = list(root.iter())

        try:

            for count, child in enumerate(children):
                start_id = util.edgeStart(sent[i])
                start_i = i

                # If current child has text, increase token counter
                if child.text:
                    i += len(child.text.strip().split(TOK_SEP))

                    # Extract NE tags and save them in dictionaries
                    if child.tag != "sroot":
                        # An element starting before the previous one ended overlaps it
                        # and is silently discarded.
                        if start_i >= previous_end:
                            end_id = util.edgeEnd(sent[i - 1])
                            previous_end = i
                            edge = util.mkEdge('ne', [start_id, end_id])
                            out_ex_dict[edge] = child.tag
                            out_type_dict[edge] = child.get("TYPE")
                            out_subtype_dict[edge] = child.get("SBT")
                            out_name_dict[edge] = child.text

                        # If this child has a tail and it doesn't start with a space, or if it has no tail at all despite not being the last child,
                        # it means this NE ends in the middle of a token.
                        if (child.tail and child.tail.strip()
                                and not child.tail[0] == " ") or (
                                    not child.tail
                                    and count < len(children) - 1):
                            i -= 1

                # If current child has text in the tail, increase token counter
                if child.tail and child.tail.strip():
                    i += len(child.tail.strip().split(TOK_SEP))

                if (child.tag == "sroot" and child.text and not child.text[-1]
                        == " ") or (child.tail and not child.tail[-1] == " "):
                    # The next NE would start in the middle of a token, so decrease the counter by 1
                    i -= 1
        except IndexError:
            # The token counter ran past the end of the sentence.
            util.log.warning("Error parsing sentence. Skipping.")
            continue

    # Write annotations
    util.write_annotation(out_ne_ex, out_ex_dict)
    util.write_annotation(out_ne_type, out_type_dict)
    util.write_annotation(out_ne_subtype, out_subtype_dict)
    util.write_annotation(out_ne_name, out_name_dict)
Example #6
0
 def make_span(edge):
     """Return the slice covering the text positions between the edge's start and end anchors."""
     begin = anchor2pos[util.edgeStart(edge)]
     end = anchor2pos[util.edgeEnd(edge)]
     return slice(begin, end)