Ejemplo n.º 1
0
 def extract_spans(ann):
     """Yield (char offset, anchor char length) for each support entry."""
     for qs in ann.attrib["support"].split(" "):
         entry = parse_qs_single(qs)
         pos = parse_qs_single(entry["transfer-from-anchor-positions"])
         start = int(pos["char"])
         length = int(entry["transfer-from-anchor-char-length"])
         yield start, length
Ejemplo n.º 2
0
 def tran(elem):
     """Inline each annotation's transfer-from reference.

     For every annotation in `lang`, replace the ``transfer-from`` id in
     each support entry with the referenced annotation's wordnets, source,
     lemma path, anchor positions and anchor length, re-encoding the
     support attribute in place.
     """
     query = "./annotations/annotation[@lang='{}']".format(lang)
     for ann in elem.xpath(query):
         support_attr = ann.attrib.get("support")
         if not support_attr:
             continue
         rebuilt = []
         for supp_qs in support_attr.split(" "):
             supp_dict = parse_qs_single(supp_qs)
             src_id = supp_dict["transfer-from"]
             src_elem = elem.xpath(
                 "./annotations/annotation[@id='{}']".format(src_id))[0]
             positions = src_elem.attrib["anchor-positions"]
             # Only the last listed anchor position's from-id is recorded.
             source = None
             for pos_qs in positions.split(" "):
                 source = parse_qs_single(pos_qs)["from-id"]
             del supp_dict["transfer-from"]
             supp_dict.update({
                 "transfer-from-wordnets":
                 src_elem.attrib["wordnets"],
                 "transfer-from-source":
                 source,
                 "transfer-from-lemma-path":
                 src_elem.attrib["lemma-path"],
                 "transfer-from-anchor-positions":
                 positions,
                 "transfer-from-anchor-char-length":
                 len(src_elem.attrib["anchor"]),
             })
             rebuilt.append(urlencode(supp_dict))
         ann.attrib["support"] = " ".join(rebuilt)
Ejemplo n.º 3
0
    def select_wn(ann):
        """Restrict an annotation to the selected wordnets.

        Filters the wordnets attribute (returning BYPASS when none
        survive), drops wnlemma entries whose wordnets were all filtered
        out, and, when the annotation spans more languages than selected,
        keeps only the lemma text for the selected language.
        """
        # annotation[wordnets]
        wns = ann.attrib["wordnets"].split()
        kept_wns = filter_wns(wns)
        if len(kept_wns) == 0:
            return BYPASS
        ann.attrib["wordnets"] = " ".join(kept_wns)

        # annotation[wnlemma]: keep entries with at least one surviving wn
        kept_lemmas = []
        for bit in ann.attrib["wnlemma"].split(" "):
            entry = parse_qs_single(bit)
            entry_wns = filter_wns(entry["wn"].split(","))
            if entry_wns:
                entry["wn"] = ",".join(entry_wns)
                kept_lemmas.append(urlencode(entry))
        ann.attrib["wnlemma"] = " ".join(kept_lemmas)

        # annotation > #text
        if len(langs_of_wns(wns)) <= len(selected_langs):
            return
        pieces = ann.text.split(" ")
        assert len(pieces) <= 2
        if len(pieces) <= 1:
            return
        # Two lemma variants: English first, the other language second.
        ann.text = pieces[0] if "eng" in selected_langs else pieces[1]
Ejemplo n.º 4
0
 def rank(ann):
     """Rank an annotation by its transfer sources.

     Returns 1 when any support was transferred from a wordnet other than
     "qwc", otherwise 0. Annotations with no support rank 0.
     """
     if "support" not in ann.attrib:
         return 0
     transfer_from = set()
     for support_qs in ann.attrib["support"].split(" "):
         support = parse_qs_single(support_qs)
         # Accumulate wordnets across ALL supports. Previously the set was
         # reassigned on every iteration, so only the last support counted
         # even though the empty-set initialization implies accumulation.
         transfer_from |= set(support["transfer-from-wordnets"].split("+"))
     return 1 if (transfer_from - {"qwc"}) else 0
Ejemplo n.º 5
0
 def rank(ann):
     """Rank 1 when at least one support has transfer-type "aligned"."""
     if "support" not in ann.attrib:
         return 0
     flags = [
         parse_qs_single(qs)["transfer-type"] == "aligned"
         for qs in ann.attrib["support"].split(" ")
     ]
     return 1 if any(flags) else 0
Ejemplo n.º 6
0
 def rank(ann):
     """Rank by the longest transfer-from anchor among all supports."""
     if "support" not in ann.attrib:
         return 0
     best = 0
     for qs in ann.attrib["support"].split(" "):
         entry = parse_qs_single(qs)
         best = max(best, int(entry["transfer-from-anchor-char-length"]))
     return best
Ejemplo n.º 7
0
 def rank(ann):
     """Rank 1 when some support's transform chain avoids derivation.

     Returns 0 when there is no support, or as soon as any support lacks
     a transform-chain attribute.
     """
     if "support" not in ann.attrib:
         return 0
     non_deriv_seen = False
     for qs in ann.attrib["support"].split(" "):
         entry = parse_qs_single(qs)
         if "transform-chain" not in entry:
             return 0
         # XXX: Should be json
         chain = ast.literal_eval(entry["transform-chain"])
         non_deriv_seen = non_deriv_seen or "deriv" not in chain
     return 1 if non_deriv_seen else 0
Ejemplo n.º 8
0
 def rank(ann, finnpos_analys):
     """Rank 1 when the FinnPOS head lemma matches any wnlemma head word.

     The head word is taken at the annotation's headword offset within
     each underscore-joined wordnet lemma.
     """
     if "wnlemma" not in ann.attrib or not ann.attrib["wnlemma"]:
         return 0
     tok, _tok_len = get_ann_pos(ann)
     head_off = get_headword_offset(ann)
     finnpos_head, _feats = finnpos_analys[tok + head_off]
     head_words = [
         parse_qs_single(bit)["l"].split("_")[head_off]
         for bit in ann.attrib["wnlemma"].split(" ")
     ]
     return 1 if finnpos_head in head_words else 0
Ejemplo n.º 9
0
def get_lemma(ann):
    """Pick the best wordnet lemma for an annotation.

    Preference: a lemma whose surfaced form equals the annotation's lemma
    attribute (score 2), then one equal to the lowercased anchor (score 1),
    then earlier entries over later ones (score -index). Ties keep the
    first-seen candidate.
    """
    assert ann.attrib["wnlemma"]
    best = (None, -2)  # (lemma, goodness)
    for idx, bit in enumerate(ann.attrib["wnlemma"].split(" ")):
        lemma = parse_qs_single(bit)["l"]
        surfed = wnlemma_to_analy_lemma(lemma)
        if surfed == ann.attrib["lemma"]:
            goodness = 2
        elif surfed == ann.attrib["anchor"].lower():
            goodness = 1
        else:
            goodness = -idx
        if goodness > best[1]:
            best = (lemma, goodness)
    assert best[0] is not None
    return best[0]
Ejemplo n.º 10
0
def overlap_examples(inf):
    """Print zh-untok lemmas that are not substrings of any zh-tok token.

    For manual inspection: each offending lemma is printed together with
    all of the sentence's text variants.
    """
    for sent in iter_sentences(inf):
        tok_lems = sent.xpath("./text[@id='zh-tok']")[0].text.split(" ")
        untok_lems = set()
        for ann in sent.xpath("./annotations/annotation[@lang='zh']"):
            for pos_qs in ann.attrib["anchor-positions"].split(" "):
                if parse_qs_single(pos_qs)["from-id"] == "zh-untok":
                    untok_lems.add(ann.attrib["lemma"])
        for lem in untok_lems:
            contained = any(lem in tok for tok in tok_lems)
            if not contained:
                print("Not a substring:", lem)
                for text in sent.xpath("./text"):
                    print(text.text)
Ejemplo n.º 11
0
def key_ann(ann):
    """Sort key for annotations: (token index, anchor text, synset id)."""
    from stiff.munge.utils import synset_id_of_ann

    position = parse_qs_single(ann.attrib["anchor-positions"])
    return int(position["token"]), ann.attrib["anchor"], synset_id_of_ann(ann)
Ejemplo n.º 12
0
def stiff_to_unified(stiff: IO, unified: IO, input_fmt: str):
    """
    Do the XML conversion from the STIFF format (similar to the Eurosense
    format) to the Unified format. Note that this assumes that previous
    filtering has produced an unambiguous tagging.

    :param stiff: input stream with the STIFF/Eurosense XML
    :param unified: output stream receiving the Unified XML
    :param input_fmt: one of "man-ann-stiff", "stiff", "man-ann-europarl"
    """
    # man-ann-europarl input follows Eurosense conventions; the others STIFF.
    write_header(unified,
                 "eurosense" if input_fmt == "man-ann-europarl" else "stiff")
    # Select the sentence iterator matching the input format; each yields
    # (sentence id, sentence element) pairs.
    if input_fmt == "man-ann-stiff":
        sent_iter = iter_sentences_opensubs18_man_ann(stiff)
    elif input_fmt == "stiff":
        sent_iter = opensubs18_ids_to_unified(iter_sentences_opensubs18(stiff))
    else:
        assert input_fmt == "man-ann-europarl"
        sent_iter = iter_sentences_eurosense(stiff)
    for sent_id, sent_elem in sent_iter:
        unified.write('<sentence id="{}">\n'.format(sent_id))
        text_elem = sent_elem.xpath("text")[0]
        text_id = text_elem.attrib.get("id")
        # Collect (char offset, anchor surface, lemma, annotation text used
        # as the sense key) for every annotation anchored in this text.
        anns = []
        for ann in sent_elem.xpath(".//annotation"):
            our_pos = None
            # Keep the last position whose from-id matches this text element
            # (or any position when the text element has no id).
            for pos_enc in ann.attrib["anchor-positions"].split(" "):
                pos = parse_qs_single(pos_enc)
                if text_id is None or pos["from-id"] == text_id:
                    our_pos = pos
            assert our_pos is not None, "Didn't find a usable anchor position"
            char_id = int(our_pos["char"])
            anns.append(
                (char_id, ann.attrib["anchor"], get_lemma(ann), ann.text))
        # Sort by char offset so the cursor sweep below can consume them
        # front to back.
        anns.sort()
        sent = text_elem.text
        cursor = 0
        # Sweep over the sentence text, emitting an <instance> for each
        # annotated anchor and a <wf> for each unannotated word form.
        while cursor < len(sent):
            instance = None
            # Gather all annotations that apply at the current cursor
            # position into a single instance (multiple sense keys allowed,
            # but lemma and anchor must agree).
            while 1:
                if not len(anns):
                    break
                char_id, anchor, lemma, ann = anns[0]
                assert (
                    char_id >= cursor
                ), "Moved past anchor position - can't have overlapping anchors"
                if char_id > cursor:
                    # Try again to move past leading punctation which has been
                    # put in the same token like: `-ajoneuvo` with anchor
                    # ajoneuvo

                    # XXX: This approach just deletes the leading punctation.
                    # Probably not what is wanted but servicable for the time
                    # being.
                    old_cursor = cursor
                    while not (sent[cursor].isalnum()
                               or sent[cursor].isspace()) and cursor < min(
                                   char_id, len(sent)):
                        cursor += 1
                    if cursor != char_id:
                        # Reset
                        cursor = old_cursor
                        break
                if instance is None:
                    instance = {"lemma": lemma, "anchor": anchor, "key": []}
                else:
                    assert (instance["lemma"] == lemma
                            ), "Can't tag an instance with multiple lemmas"
                    assert (
                        instance["anchor"] == anchor
                    ), "Can't have different anchors at different positions"
                instance["key"].append(ann)
                del anns[0]
            if instance is not None:
                # POS comes from the last character of the last sense key —
                # presumably the wordnet POS letter; confirm against the
                # annotation text format.
                pos = WN_UNI_POS_MAP[instance["key"][-1][-1]]
                unified.write(
                    '<instance lemma="{}" key="{}" pos="{}">{}</instance>\n'.
                    format(
                        instance["lemma"],
                        " ".join(instance["key"]),
                        pos,
                        instance["anchor"],
                    ))
                # XXX: This approach just deletes the trailing punctation.
                # Probably not what is wanted but servicable for the time
                # being. Old code:
                # cursor += len(instance["anchor"]) + 1

                # Advance to the character after the next space (or stop at
                # end of sentence).
                end_pos = sent.find(" ", cursor)
                if end_pos == -1:
                    break
                cursor = end_pos + 1
            else:
                # No annotation here: emit the next whitespace-delimited
                # token as a plain word form.
                end_pos = sent.find(" ", cursor)
                if end_pos == -1:
                    end_pos = None
                unified.write("<wf>{}</wf>\n".format(
                    escape(sent[cursor:end_pos])))
                if end_pos is None:
                    break
                cursor = end_pos + 1
        unified.write("</sentence>")

    unified.write("</text>\n")
    unified.write("</corpus>\n")
Ejemplo n.º 13
0
def get_ann_pos_dict(ann):
    """Parse the annotation's single anchor position into a dict.

    Asserts that exactly one anchor position is present.
    """
    positions = ann.attrib["anchor-positions"].split()
    assert len(positions) == 1
    return parse_qs_single(positions[0])