def extract_spans(ann):
    for support_qs in ann.attrib["support"].split(" "):
        support_dict = parse_qs_single(support_qs)
        positions = parse_qs_single(
            support_dict["transfer-from-anchor-positions"])
        yield int(positions["char"]), int(
            support_dict["transfer-from-anchor-char-length"])

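# parse_qs_single is a project helper used throughout this file; a minimal
# sketch of what it is assumed to do (flatten the single-valued query strings
# stored in these XML attributes) follows. This is an illustration under that
# assumption, not the project's actual implementation.
from urllib.parse import parse_qs

def parse_qs_single_sketch(qs):
    # parse_qs maps each key to a list of values; the attributes handled
    # here are single-valued, so keep only the first value per key.
    return {key: vals[0] for key, vals in parse_qs(qs).items()}

# Under that assumption, an annotation whose support attribute contains
#   transfer-from-anchor-positions=char%3D5&transfer-from-anchor-char-length=8
# makes extract_spans yield (5, 8): the source anchor's character offset
# and its length.
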
def tran(elem):
    # NB: lang is a free variable here, presumably bound by an enclosing
    # function.
    xpath = "./annotations/annotation[@lang='{}']".format(lang)
    for ann in elem.xpath(xpath):
        support = ann.attrib.get("support")
        if not support:
            continue
        new_support = []
        for supp in support.split(" "):
            supp = parse_qs_single(supp)
            trans_from = supp["transfer-from"]
            from_elem = elem.xpath(
                "./annotations/annotation[@id='{}']".format(trans_from))[0]
            from_wordnets = from_elem.attrib["wordnets"]
            anchor_positions = from_elem.attrib["anchor-positions"]
            for position in anchor_positions.split(" "):
                from_anchor = parse_qs_single(position)
                from_source = from_anchor["from-id"]
            from_lemma_path = from_elem.attrib["lemma-path"]
            from_anchor_char_length = len(from_elem.attrib["anchor"])
            del supp["transfer-from"]
            supp.update({
                "transfer-from-wordnets": from_wordnets,
                "transfer-from-source": from_source,
                "transfer-from-lemma-path": from_lemma_path,
                "transfer-from-anchor-positions": anchor_positions,
                "transfer-from-anchor-char-length": from_anchor_char_length,
            })
            new_support.append(urlencode(supp))
        ann.attrib["support"] = " ".join(new_support)

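# Illustrative (assumed) effect of tran: a support entry that refers to a
# source annotation by id, e.g.
#   transfer-type=aligned&transfer-from=a7
# is rewritten so the relevant fields of annotation a7 are inlined:
#   transfer-type=aligned&transfer-from-wordnets=...&transfer-from-source=...
#   &transfer-from-lemma-path=...&transfer-from-anchor-positions=...
#   &transfer-from-anchor-char-length=...
# After this pass, consumers such as extract_spans above no longer need to
# look up the source annotation.
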
def select_wn(ann):
    # annotation[wordnets]
    ann_wns = ann.attrib["wordnets"].split()
    common_wns = filter_wns(ann_wns)
    if not common_wns:
        return BYPASS
    ann.attrib["wordnets"] = " ".join(common_wns)
    # annotation[wnlemma]
    wnlemma_bits = ann.attrib["wnlemma"].split(" ")
    new_wnlemma_bits = []
    for wnlemma in wnlemma_bits:
        wnlemma_dict = parse_qs_single(wnlemma)
        wnlemma_wns = wnlemma_dict["wn"].split(",")
        common_wns = filter_wns(wnlemma_wns)
        if not common_wns:
            continue
        wnlemma_dict["wn"] = ",".join(common_wns)
        new_wnlemma_bits.append(urlencode(wnlemma_dict))
    ann.attrib["wnlemma"] = " ".join(new_wnlemma_bits)
    # annotation > #text
    ann_langs = langs_of_wns(ann_wns)
    if len(ann_langs) <= len(selected_langs):
        return
    lemmas_str = ann.text
    bits = lemmas_str.split(" ")
    assert len(bits) <= 2
    if len(bits) <= 1:
        return
    # The text apparently holds the English lemma first, then the other
    # language's lemma; keep the one matching the selected languages.
    if "eng" in selected_langs:
        ann.text = bits[0]
    else:
        ann.text = bits[1]

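# Assumed context for select_wn (bound in an enclosing scope, not shown in
# this snippet): selected_langs is the set of languages to keep,
# filter_wns(wns) returns the subset of wordnet ids compatible with
# selected_langs, langs_of_wns maps wordnet ids to their languages, and
# BYPASS is a sentinel telling the caller to discard the annotation.
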
def rank(ann):
    if "support" not in ann.attrib:
        return 0
    transfer_from = set()
    for support_qs in ann.attrib["support"].split(" "):
        support = parse_qs_single(support_qs)
        # Accumulate source wordnets across all supports; the field can
        # name several wordnets joined with "+". (The original overwrote
        # transfer_from on each iteration, so only the last support
        # counted, contradicting the set initialisation above.)
        transfer_from |= set(support["transfer-from-wordnets"].split("+"))
    return 1 if (transfer_from - {"qwc"}) else 0

def rank(ann):
    if "support" not in ann.attrib:
        return 0
    have_aligned = False
    for support_qs in ann.attrib["support"].split(" "):
        support = parse_qs_single(support_qs)
        if support["transfer-type"] == "aligned":
            have_aligned = True
    return 1 if have_aligned else 0

def rank(ann):
    if "support" not in ann.attrib:
        return 0
    max_len = 0
    for support_qs in ann.attrib["support"].split(" "):
        support = parse_qs_single(support_qs)
        cur_len = int(support["transfer-from-anchor-char-length"])
        if cur_len > max_len:
            max_len = cur_len
    return max_len

def rank(ann):
    if "support" not in ann.attrib:
        return 0
    has_non_deriv = False
    for support_qs in ann.attrib["support"].split(" "):
        support = parse_qs_single(support_qs)
        if "transform-chain" not in support:
            return 0
        # XXX: Should be json
        transform_chain = ast.literal_eval(support["transform-chain"])
        if "deriv" not in transform_chain:
            has_non_deriv = True
    return 1 if has_non_deriv else 0

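# The rank functions above read as tie-breaking heuristics that score a
# single annotation. A plausible (assumed, not confirmed by this snippet)
# use is keeping the highest-scoring annotation among competitors for the
# same anchor:
def best_annotation(annotations, rank_fn):
    # max() keeps the first element on ties, so callers should pre-sort
    # the candidates by whatever secondary order they want.
    return max(annotations, key=rank_fn)
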
def rank(ann, finnpos_analys):
    if "wnlemma" not in ann.attrib or not ann.attrib["wnlemma"]:
        return 0
    tok, tok_len = get_ann_pos(ann)
    head_off = get_headword_offset(ann)
    finnpos_head, feats = finnpos_analys[tok + head_off]
    any_match = False
    for lemma_bit in ann.attrib["wnlemma"].split(" "):
        lemma_dict = parse_qs_single(lemma_bit)
        lemma = lemma_dict["l"]
        wn_head = lemma.split("_")[head_off]
        if finnpos_head == wn_head:
            any_match = True
    return 1 if any_match else 0

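# Assumed interfaces for the FinnPOS-based rank above (inferred from usage,
# not confirmed here): get_ann_pos(ann) returns (first token index, token
# count); get_headword_offset(ann) gives the headword's offset within the
# anchor; finnpos_analys is a token-indexed list of (lemma, features)
# pairs, so finnpos_analys[tok + head_off] is the headword's analysis; and
# multiword WordNet lemmas are "_"-joined, hence lemma.split("_")[head_off].
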
def get_lemma(ann):
    best_lemma = None
    best_lemma_goodness = -2
    assert ann.attrib["wnlemma"]
    for idx, lemma_bit in enumerate(ann.attrib["wnlemma"].split(" ")):
        lemma_dict = parse_qs_single(lemma_bit)
        lemma = lemma_dict["l"]
        wn_lemma_surfed = wnlemma_to_analy_lemma(lemma)
        # Prefer an exact match with the annotation lemma (2), then with
        # the lowercased anchor (1); otherwise fall back to
        # earlier-is-better list order (-idx).
        if wn_lemma_surfed == ann.attrib["lemma"]:
            goodness = 2
        elif wn_lemma_surfed == ann.attrib["anchor"].lower():
            goodness = 1
        else:
            goodness = -idx
        if goodness > best_lemma_goodness:
            best_lemma = lemma
            best_lemma_goodness = goodness
    assert best_lemma is not None
    return best_lemma

def overlap_examples(inf):
    for sent in iter_sentences(inf):
        tok_lems = sent.xpath("./text[@id='zh-tok']")[0].text.split(" ")
        untok_lems = set()
        for ann in sent.xpath("./annotations/annotation[@lang='zh']"):
            anchor_positions = ann.attrib["anchor-positions"]
            for position in anchor_positions.split(" "):
                anchor = parse_qs_single(position)
                source = anchor["from-id"]
                if source == "zh-untok":
                    untok_lems.add(ann.attrib["lemma"])
        for untok_lem in untok_lems:
            if not any(untok_lem in tok_lem for tok_lem in tok_lems):
                print("Not a substring:", untok_lem)
                for text in sent.xpath("./text"):
                    print(text.text)

def key_ann(ann):
    from stiff.munge.utils import synset_id_of_ann

    token_idx = int(parse_qs_single(ann.attrib["anchor-positions"])["token"])
    return (token_idx, ann.attrib["anchor"], synset_id_of_ann(ann))

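# key_ann looks like a deterministic sort key; a plausible (assumed) usage:
#     anns = sorted(sent_elem.xpath(".//annotation"), key=key_ann)
# ordering annotations by token position, then anchor text, then synset id.
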
def stiff_to_unified(stiff: IO, unified: IO, input_fmt: str):
    """
    Do the XML conversion from the STIFF format (similar to the Eurosense
    format) to the Unified format. Note that this assumes that previous
    filtering has produced an unambiguous tagging.
    """
    write_header(
        unified, "eurosense" if input_fmt == "man-ann-europarl" else "stiff")
    if input_fmt == "man-ann-stiff":
        sent_iter = iter_sentences_opensubs18_man_ann(stiff)
    elif input_fmt == "stiff":
        sent_iter = opensubs18_ids_to_unified(iter_sentences_opensubs18(stiff))
    else:
        assert input_fmt == "man-ann-europarl"
        sent_iter = iter_sentences_eurosense(stiff)
    for sent_id, sent_elem in sent_iter:
        unified.write('<sentence id="{}">\n'.format(sent_id))
        text_elem = sent_elem.xpath("text")[0]
        text_id = text_elem.attrib.get("id")
        anns = []
        for ann in sent_elem.xpath(".//annotation"):
            our_pos = None
            for pos_enc in ann.attrib["anchor-positions"].split(" "):
                pos = parse_qs_single(pos_enc)
                if text_id is None or pos["from-id"] == text_id:
                    our_pos = pos
            assert our_pos is not None, "Didn't find a usable anchor position"
            char_id = int(our_pos["char"])
            anns.append(
                (char_id, ann.attrib["anchor"], get_lemma(ann), ann.text))
        anns.sort()
        sent = text_elem.text
        cursor = 0
        while cursor < len(sent):
            instance = None
            while True:
                if not len(anns):
                    break
                char_id, anchor, lemma, ann = anns[0]
                assert (
                    char_id >= cursor
                ), "Moved past anchor position - can't have overlapping anchors"
                if char_id > cursor:
                    # Try again to move past leading punctuation which has
                    # been put in the same token like: `-ajoneuvo` with
                    # anchor ajoneuvo
                    # XXX: This approach just deletes the leading
                    # punctuation. Probably not what is wanted but
                    # serviceable for the time being.
                    old_cursor = cursor
                    while not (
                        sent[cursor].isalnum() or sent[cursor].isspace()
                    ) and cursor < min(char_id, len(sent)):
                        cursor += 1
                    if cursor != char_id:
                        # Reset
                        cursor = old_cursor
                        break
                if instance is None:
                    instance = {"lemma": lemma, "anchor": anchor, "key": []}
                else:
                    assert (
                        instance["lemma"] == lemma
                    ), "Can't tag an instance with multiple lemmas"
                    assert (
                        instance["anchor"] == anchor
                    ), "Can't have different anchors at different positions"
                instance["key"].append(ann)
                del anns[0]
            if instance is not None:
                pos = WN_UNI_POS_MAP[instance["key"][-1][-1]]
                unified.write(
                    '<instance lemma="{}" key="{}" pos="{}">{}</instance>\n'.
                    format(
                        instance["lemma"],
                        " ".join(instance["key"]),
                        pos,
                        instance["anchor"],
                    ))
                # XXX: This approach just deletes the trailing punctuation.
                # Probably not what is wanted but serviceable for the time
                # being. Old code:
                # cursor += len(instance["anchor"]) + 1
                end_pos = sent.find(" ", cursor)
                if end_pos == -1:
                    break
                cursor = end_pos + 1
            else:
                end_pos = sent.find(" ", cursor)
                if end_pos == -1:
                    end_pos = None
                unified.write("<wf>{}</wf>\n".format(
                    escape(sent[cursor:end_pos])))
                if end_pos is None:
                    break
                cursor = end_pos + 1
        unified.write("</sentence>\n")
    unified.write("</text>\n")
    unified.write("</corpus>\n")

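# Shape of the Unified XML this emits for one sentence (tag and attribute
# names come from the code above; the values are purely illustrative):
#   <sentence id="...">
#   <wf>word</wf>
#   <instance lemma="..." key="..." pos="...">anchor</instance>
#   </sentence>
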
def get_ann_pos_dict(ann):
    anchor_poses = ann.attrib["anchor-positions"].split()
    assert len(anchor_poses) == 1
    anchor_pos_str = anchor_poses[0]
    return parse_qs_single(anchor_pos_str)
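
# Example (data shape assumed from usage elsewhere in this file): for an
# annotation with anchor-positions="from-id=zh-tok&token=2&char=5",
# get_ann_pos_dict returns {"from-id": "zh-tok", "token": "2", "char": "5"}.
# Note it asserts there is exactly one anchor position, unlike the loops in
# tran and overlap_examples above, which handle several.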