def unified_test_dev_split(inf, ingoldf, keyin, goldkeyin, outf, keyout):
    # Split a test/dev portion out of a unified-format corpus: sentences that
    # also appear in INGOLDF are dropped from INF (result written to OUTF),
    # and the key lines for their instances are dropped from KEYIN (result
    # written to KEYOUT).
    # NOTE(review): `goldkeyin` is accepted but never read here — confirm
    # whether it is required by the CLI wrapper or can be removed.
    gold_sent_iter = peekable(iter_sentences(ingoldf))
    # Instance ids collected from removed sentences, consumed in order below.
    rm_inst_ids = []

    def sent_rm_gold(sent):
        # Both streams are assumed to present shared sentences in the same
        # order, so a single peek is enough to detect a match.
        gold_sent = gold_sent_iter.peek(None)
        if gold_sent is not None and gold_sent.attrib["id"] == sent.attrib[
                "id"]:
            for instance in sent.xpath("./instance"):
                rm_inst_ids.append(instance.attrib["id"])
            next(gold_sent_iter)
            # BYPASS drops the matched sentence from the output stream.
            return BYPASS

    transform_sentences(inf, sent_rm_gold, outf)

    def next_rm():
        # Pop the next instance id scheduled for removal, or None when done.
        try:
            return rm_inst_ids.pop(0)
        except IndexError:
            return None

    # Filter the key file: skip lines whose first field is a removed id.
    # Assumes key lines appear in the same order as the removed instances.
    rm_id = next_rm()
    for line in keyin:
        if rm_id == line.split()[0]:
            rm_id = next_rm()
            continue
        keyout.write(line)
    # Every scheduled removal must have been matched against a key line.
    assert len(rm_inst_ids) == 0 and rm_id is None
def char_span_dom(inf, outf):
    """
    Dominance filter:

    When one annotation's single-token anchor spans (contains) another, keep
    the one with the longest character span. When there is a partial overlap
    (none dominates), proceed greedily.
    """

    def sent_span_dom(sent):
        anns = sent.xpath("./annotations/annotation")
        # Sorted character start positions of all anchors in the sentence.
        # (Previously built with a loop that also bound an unused `anchor`
        # local; the comprehension drops the dead code.)
        starts = sorted(int(get_ann_pos_dict(ann)["char"]) for ann in anns)
        char_positions = {}
        for ann in anns:
            anchor_pos = get_ann_pos_dict(ann)
            anchor = ann.attrib["anchor"]
            cur_start = int(anchor_pos["char"])
            cur_start_idx = starts.index(cur_start)
            anchor_len = len(anchor)
            # Span is measured as the number of anchor start positions that
            # fall within this anchor's character range — a proxy for how
            # many other annotations this one (potentially) dominates.
            span = 0
            while (cur_start_idx + span < len(starts)
                   and starts[cur_start_idx + span] <= cur_start + anchor_len):
                span += 1
            char_positions.setdefault(cur_start, []).append((span, ann))
        # Greedily pick the longest-span annotation at each position, then
        # drop everything else.
        new_anns = greedy_max_span(char_positions)
        trim_anns(anns, new_anns)

    transform_sentences(inf, sent_span_dom, outf)
def rm_empty(inf, outf, text):
    """
    Remove sentences with no annotations, or optionally with no text instead.
    """

    def remove_empty(elem):
        # Which child list decides emptiness depends on the `text` flag.
        query = "./text" if text else "./annotations/annotation"
        if not elem.xpath(query):
            return BYPASS

    transform_sentences(inf, remove_empty, outf)
def eurosense_add_anchor_positions(inf: IO, outf: IO):
    """
    Stamp each anchored annotation with its token and character offsets as a
    query-string style "anchor-positions" attribute.
    """

    def add_anchor_positions(sent_elem):
        anchored = iter_anchored_anns(sent_elem)
        for tok_cursor, cursor, _match_anchor, ann in anchored:
            # Entries without an annotation carry only cursor bookkeeping.
            if ann is not None:
                ann.attrib[
                    "anchor-positions"] = f"token={tok_cursor}&char={cursor}"

    transform_sentences(inf, add_anchor_positions, outf)
def filter_lang(lang, inf, outf):
    """
    Change a multilingual corpus to a monolingual one by selecting a single
    language.
    """

    def remove_other_langs(elem):
        # Both annotation and text nodes carry a lang attribute; drop any
        # node whose language differs from the requested one.
        for node in elem.xpath("./annotations/annotation | ./text"):
            if node.attrib["lang"] != lang:
                node.getparent().remove(node)

    transform_sentences(inf, remove_other_langs, outf)
def head(inf, outf, sentences):
    """
    Take the first SENTENCES sentences from INF.
    """
    emitted = 0

    def stop_after_limit(sent):
        # Stop the whole transform once the quota is reached.
        nonlocal emitted
        if emitted >= sentences:
            return BREAK
        emitted += 1

    transform_sentences(inf, stop_after_limit, outf)
    inf.close()
    outf.close()
def unified_split(inf: IO, outf: IO, keyout: IO):
    """
    Split a keyfile out of a variant of the unified format which includes
    sense keys inline.
    """

    def sent_split_key(sent_elem):
        sent_id = sent_elem.attrib["id"]
        for idx, inst in enumerate(sent_elem.xpath("instance")):
            # Move the inline key out to the key file, and give the instance
            # a deterministic id derived from its sentence and position.
            key = inst.attrib["key"]
            del inst.attrib["key"]
            key_id = f"{sent_id}.{idx:08d}"
            inst.attrib["id"] = key_id
            keyout.write(f"{key_id} {key}\n")

    transform_sentences(inf, sent_split_key, outf)
def sample(inf, outf):
    """
    Sample the sentences in DEFAULT_SAMPLE_LINES (fixed) from inf
    """
    seen_sents = 0

    def count_break_sent(sent):
        nonlocal seen_sents
        # Nothing past the maximum index can be part of the sample.
        if seen_sents >= DEFAULT_SAMPLE_MAX:
            return BREAK
        idx = seen_sents
        seen_sents += 1
        # Only sentences at the fixed sample indices pass through.
        if idx not in DEFAULT_SAMPLE_LINES:
            return BYPASS

    transform_sentences(inf, count_break_sent, outf)
    if seen_sents <= max(DEFAULT_SAMPLE_LINES):
        print("Not enough sentences in input to sample.")
def rm_ambg(inf, outf):
    """
    Remove ambiguous annotations of the same span.

    When two or more annotations share exactly the same span, all of them
    are dropped; only annotations whose span is unique survive.
    """

    def sent_rm_ambg(sent):
        anns = sent.xpath("./annotations/annotation")
        # Count how many annotations claim each span.
        span_counts = {}
        for ann in anns:
            span = get_ann_pos(ann)
            span_counts[span] = span_counts.get(span, 0) + 1
        # Keep only uniquely-spanned annotations. (Replaces the previous
        # copy-then-list.remove pattern, which was accidentally O(n^2).)
        new_anns = [ann for ann in anns if span_counts[get_ann_pos(ann)] < 2]
        trim_anns(anns, new_anns)

    transform_sentences(inf, sent_rm_ambg, outf)
def fold_support(lang, inf, outf):
    """
    Move information about how an annotation is connected to a wordnet
    how it is anchored into annotations which it supports in LANG.
    """

    def tran(elem):
        xpath = "./annotations/annotation[@lang='{}']".format(lang)
        for ann in elem.xpath(xpath):
            support = ann.attrib.get("support")
            if not support:
                continue
            # Rewrite each space-separated support entry, replacing its
            # "transfer-from" annotation id with details copied from the
            # referenced source annotation.
            new_support = []
            for supp in support.split(" "):
                supp = parse_qs_single(supp)
                trans_from = supp["transfer-from"]
                from_elem = elem.xpath(
                    "./annotations/annotation[@id='{}']".format(trans_from))[0]
                from_wordnets = from_elem.attrib["wordnets"]
                anchor_positions = from_elem.attrib["anchor-positions"]
                for position in anchor_positions.split(" "):
                    from_anchor = parse_qs_single(position)
                    # NOTE(review): only the LAST position's from-id is kept
                    # (each iteration overwrites from_source) — confirm this
                    # is intended rather than joining all sources.
                    from_source = from_anchor["from-id"]
                from_lemma_path = from_elem.attrib["lemma-path"]
                from_anchor_char_length = len(from_elem.attrib["anchor"])
                del supp["transfer-from"]
                supp.update({
                    "transfer-from-wordnets": from_wordnets,
                    "transfer-from-source": from_source,
                    "transfer-from-lemma-path": from_lemma_path,
                    "transfer-from-anchor-positions": anchor_positions,
                    "transfer-from-anchor-char-length": from_anchor_char_length,
                })
                # Re-encode the enriched entry back to query-string form.
                new_support.append(urlencode(supp))
            ann.attrib["support"] = " ".join(new_support)

    transform_sentences(inf, tran, outf)
def finnpos_rm_pos(inf, outf, level):
    """
    Heuristic POS removal: Remove specific POSs altogether. Most commonly
    PRONOUN, since this POS never exists in WordNet.

    `level` selects how aggressive the removal is: anything removes
    pronouns; "normal"/"agg" also remove closed-class POSs and proper
    nouns; "agg" additionally removes adpositions.
    """

    def m(feat, val):
        # Predicate factory: matches when feature FEAT has value VAL.
        def inner(feats):
            return feats.get(feat) == val

        return inner

    to_remove = [m("pos", "PRONOUN")]
    if level in ("normal", "agg"):
        to_remove.extend((
            m("pos", "NUMERAL"),
            m("pos", "INTERJECTION"),
            m("pos", "CONJUNCTION"),
            m("pos", "PARTICLE"),
            m("pos", "PUNCTUATION"),
            m("proper", "PROPER"),
        ))
    if level == "agg":
        to_remove.append(m("pos", "ADPOSITION"))

    def sent_rm_pos(sent):
        finnpos_analys = get_finnpos_analys(sent)
        anns = sent.xpath("./annotations/annotation")
        # Build the keep-list directly instead of copying and calling
        # list.remove per hit (which was accidentally O(n^2)).
        new_anns = []
        for ann in anns:
            tok, tok_len = get_ann_pos(ann)
            # Only single-token annotations are subject to POS filtering.
            if tok_len == 1:
                props = finnpos_analys[tok][1]
                if any(match(props) for match in to_remove):
                    continue
            new_anns.append(ann)
        trim_anns(anns, new_anns)

    transform_sentences(inf, sent_rm_pos, outf)
def tok_span_dom(inf, outf, sup_only=False):
    """
    Dominance filter:

    When one annotation's multi-token anchor spans (contains) another, keep
    the one with the longest token span. When there is a partial overlap
    (none dominates), proceed greedily.
    """

    def sent_span_dom(sent):
        anns = sent.xpath("./annotations/annotation")
        # Map token start position -> [(token span length, annotation), ...]
        token_positions = {}
        for ann in anns:
            # With sup_only, only supported annotations enter the dominance
            # tournament; unsupported ones are re-added unconditionally below.
            if sup_only and not HasSupportTournament.rank(ann):
                continue
            tok, tok_len = get_ann_pos(ann)
            token_positions.setdefault(tok, []).append((tok_len, ann))
        new_anns = greedy_max_span(token_positions)
        if sup_only:
            for ann in anns:
                if not HasSupportTournament.rank(ann):
                    new_anns.append(ann)
        trim_anns(anns, new_anns)

    transform_sentences(inf, sent_span_dom, outf)
def proc_stream(self, inf, outf):
    # Apply this object's per-sentence processor to the whole stream,
    # returning whatever transform_sentences returns.
    return transform_sentences(inf, self.proc_sent, outf)