Example #1
def unified_test_dev_split(inf, ingoldf, keyin, goldkeyin, outf, keyout):
    """
    Split the sentences that appear in the gold file out of INF, writing the
    rest to OUTF, and drop the matching instances from the keyfile.
    """
    gold_sent_iter = peekable(iter_sentences(ingoldf))
    rm_inst_ids = []

    def sent_rm_gold(sent):
        # When the next gold sentence matches this one, record its instance
        # ids for keyfile filtering below and drop it from the output.
        gold_sent = gold_sent_iter.peek(None)
        if gold_sent is not None and gold_sent.attrib["id"] == sent.attrib["id"]:
            for instance in sent.xpath("./instance"):
                rm_inst_ids.append(instance.attrib["id"])
            next(gold_sent_iter)
            return BYPASS

    transform_sentences(inf, sent_rm_gold, outf)

    def next_rm():
        try:
            return rm_inst_ids.pop(0)
        except IndexError:
            return None

    rm_id = next_rm()
    for line in keyin:
        if rm_id == line.split()[0]:
            rm_id = next_rm()
            continue
        keyout.write(line)

    # Every instance id collected from the gold sentences must have matched a
    # keyfile line by now.
    assert len(rm_inst_ids) == 0 and rm_id is None
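
All of these examples drive transform_sentences together with the BYPASS/BREAK sentinels, none of which are shown on this page. A rough sketch of the contract they assume (the real implementation presumably streams; only the names come from the snippets, the rest is inferred):

from lxml import etree

BYPASS = object()  # processor verdict: drop this sentence from the output
BREAK = object()   # processor verdict: stop processing entirely

def transform_sentences(inf, proc, outf):
    # Sketch only: parse the whole document, apply proc to each <sentence>,
    # and honour the sentinel return values.
    tree = etree.parse(inf)
    sents = tree.xpath("//sentence")
    for idx, sent in enumerate(sents):
        verdict = proc(sent)  # proc may also mutate sent in place
        if verdict is BREAK:
            # Drop this sentence and everything after it.
            for rest in sents[idx:]:
                rest.getparent().remove(rest)
            break
        if verdict is BYPASS:
            sent.getparent().remove(sent)
    tree.write(outf)  # outf must be opened in binary mode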
Example #2
def char_span_dom(inf, outf):
    """
    Dominance filter:

    When one annotation's single-token anchor spans (contains) another, keep
    the one with the longest character span. When there is a partial overlap
    (none dominates), proceed greedily.
    """
    def sent_span_dom(sent):
        anns = sent.xpath("./annotations/annotation")
        starts = []
        for ann in anns:
            anchor_pos = get_ann_pos_dict(ann)
            starts.append(int(anchor_pos["char"]))
        starts.sort()
        char_positions = {}
        for ann in anns:
            anchor_pos = get_ann_pos_dict(ann)
            anchor = ann.attrib["anchor"]
            cur_start = int(anchor_pos["char"])
            cur_start_idx = starts.index(cur_start)
            anchor_len = len(anchor)
            # The span is measured as the number of annotation start
            # positions this anchor covers, counting its own.
            span = 0
            while (cur_start_idx + span < len(starts)
                   and starts[cur_start_idx + span] <= cur_start + anchor_len):
                span += 1
            char_positions.setdefault(cur_start, []).append((span, ann))
        new_anns = greedy_max_span(char_positions)
        trim_anns(anns, new_anns)

    transform_sentences(inf, sent_span_dom, outf)
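
trim_anns, which finishes this filter and several of the ones below, is also not shown in these excerpts; its assumed contract, inferred from its call sites, is just to delete whatever did not survive:

def trim_anns(anns, new_anns):
    # Assumed helper: remove every annotation element the filter did not keep.
    for ann in anns:
        if ann not in new_anns:
            ann.getparent().remove(ann)

greedy_max_span is sketched under Example #12 below.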
Example #3
def rm_empty(inf, outf, text):
    """
    Remove sentences with no annotations, or optionally with no text instead.
    """
    def remove_empty(elem):
        if text:
            empty = len(elem.xpath("./text")) == 0
        else:
            empty = len(elem.xpath("./annotations/annotation")) == 0
        if empty:
            return BYPASS

    transform_sentences(inf, remove_empty, outf)
Example #4
def eurosense_add_anchor_positions(inf: IO, outf: IO):
    def add_anchor_positions(sent_elem):
        for tok_cursor, cursor, _match_anchor, ann in iter_anchored_anns(
                sent_elem):
            if ann is None:
                continue
            ann.attrib["anchor-positions"] = f"token={tok_cursor}&char={cursor}"

    transform_sentences(inf, add_anchor_positions, outf)
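
The anchor-positions attribute written here is what get_ann_pos_dict and get_ann_pos (used by the dominance and POS filters) read back. A plausible sketch, reusing the parse_qs_single helper sketched under Example #10; the "token-length" key is a guess, since the excerpts only show that a (token, length) pair comes out:

def get_ann_pos_dict(ann):
    # Assumed inverse of the attribute written above, e.g. "token=3&char=17".
    return parse_qs_single(ann.attrib["anchor-positions"])

def get_ann_pos(ann):
    pos = get_ann_pos_dict(ann)
    # "token-length" is assumed; single-token anchors default to length 1.
    return int(pos["token"]), int(pos.get("token-length", 1))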
Example #5
def filter_lang(lang, inf, outf):
    """
    Change a multilingual corpus to a monolingual one by selecting a single
    language.
    """
    def remove_other_langs(elem):
        for ann in elem.xpath("./annotations/annotation | ./text"):
            if ann.attrib["lang"] == lang:
                continue
            ann.getparent().remove(ann)

    transform_sentences(inf, remove_other_langs, outf)
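
A minimal round trip showing the XPath union at work, on a hypothetical sentence element shaped the way the expressions imply:

from lxml import etree

sent = etree.fromstring(
    '<sentence id="1">'
    '<text lang="fi">kissa istuu</text>'
    '<text lang="en">the cat sits</text>'
    '<annotations>'
    '<annotation lang="fi" anchor="kissa">fi-ann</annotation>'
    '<annotation lang="en" anchor="cat">en-ann</annotation>'
    '</annotations>'
    '</sentence>'
)
for node in sent.xpath("./annotations/annotation | ./text"):
    if node.attrib["lang"] != "fi":
        node.getparent().remove(node)
# Only the lang="fi" <text> and <annotation> remain.
print(etree.tostring(sent, pretty_print=True).decode())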
Example #6
def head(inf, outf, sentences):
    """
    Take the first SENTENCES sentences from INF.
    """
    seen_sents = 0

    def count_break_sent(sent):
        nonlocal seen_sents
        if seen_sents >= sentences:
            return BREAK
        seen_sents += 1

    transform_sentences(inf, count_break_sent, outf)
    inf.close()
    outf.close()
Example #7
def unified_split(inf: IO, outf: IO, keyout: IO):
    """
    Split a keyfile out of a variant of the unified format which includes sense
    keys inline.
    """
    def sent_split_key(sent_elem):
        sent_id = sent_elem.attrib["id"]
        for idx, inst in enumerate(sent_elem.xpath("instance")):
            key = inst.attrib["key"]
            del inst.attrib["key"]
            key_id = "{}.{:08d}".format(sent_id, idx)
            inst.attrib["id"] = key_id
            keyout.write("{} {}\n".format(key_id, key))

    transform_sentences(inf, sent_split_key, outf)
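
The generated instance ids are just the sentence id plus a zero-padded counter, so for a hypothetical sentence id d000.s012 the first keyfile line starts with:

"{}.{:08d}".format("d000.s012", 0)  # -> 'd000.s012.00000000'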
Example #8
def sample(inf, outf):
    """
    Sample the sentences in DEFAULT_SAMPLE_LINES (fixed) from inf
    """
    seen_sents = 0

    def count_break_sent(sent):
        nonlocal seen_sents
        if seen_sents >= DEFAULT_SAMPLE_MAX:
            return BREAK
        if seen_sents not in DEFAULT_SAMPLE_LINES:
            seen_sents += 1
            return BYPASS
        seen_sents += 1

    transform_sentences(inf, count_break_sent, outf)

    if seen_sents <= max(DEFAULT_SAMPLE_LINES):
        print("Not enough sentences in input to sample.")
Example #9
def rm_ambg(inf, outf):
    """
    Remove ambiguous annotations of the same span.
    """
    def sent_rm_ambg(sent):
        anns = sent.xpath("./annotations/annotation")
        new_anns = anns.copy()
        span_counts = {}
        for ann in anns:
            span = get_ann_pos(ann)
            if span not in span_counts:
                span_counts[span] = 0
            span_counts[span] += 1
        for ann in anns:
            span = get_ann_pos(ann)
            if span_counts[span] >= 2:
                new_anns.remove(ann)
        trim_anns(anns, new_anns)

    transform_sentences(inf, sent_rm_ambg, outf)
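
The counting loop is a plain histogram; an equivalent, slightly more compact formulation with collections.Counter would be:

from collections import Counter

def sent_rm_ambg(sent):
    anns = sent.xpath("./annotations/annotation")
    span_counts = Counter(get_ann_pos(ann) for ann in anns)
    # Keep only annotations whose span no other annotation claims.
    new_anns = [ann for ann in anns if span_counts[get_ann_pos(ann)] == 1]
    trim_anns(anns, new_anns)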
Example #10
def fold_support(lang, inf, outf):
    """
    Move information about how an annotation is connected to a wordnet how it
    is anchored into annotations which it supports in LANG.
    """
    def tran(elem):
        xpath = "./annotations/annotation[@lang='{}']".format(lang)
        for ann in elem.xpath(xpath):
            support = ann.attrib.get("support")
            if not support:
                continue
            new_support = []
            for supp in support.split(" "):
                supp = parse_qs_single(supp)
                trans_from = supp["transfer-from"]
                from_elem = elem.xpath(
                    "./annotations/annotation[@id='{}']".format(trans_from))[0]
                from_wordnets = from_elem.attrib["wordnets"]
                anchor_positions = from_elem.attrib["anchor-positions"]
                # from_source is taken from the last listed anchor position.
                for position in anchor_positions.split(" "):
                    from_anchor = parse_qs_single(position)
                    from_source = from_anchor["from-id"]
                from_lemma_path = from_elem.attrib["lemma-path"]
                from_anchor_char_length = len(from_elem.attrib["anchor"])
                del supp["transfer-from"]
                supp.update({
                    "transfer-from-wordnets": from_wordnets,
                    "transfer-from-source": from_source,
                    "transfer-from-lemma-path": from_lemma_path,
                    "transfer-from-anchor-positions": anchor_positions,
                    "transfer-from-anchor-char-length": from_anchor_char_length,
                })
                new_support.append(urlencode(supp))
            ann.attrib["support"] = " ".join(new_support)

    transform_sentences(inf, tran, outf)
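
The support attribute round-trips through query-string encoding: urlencode (from urllib.parse) on the way out, and parse_qs_single on the way in, which is presumably something like:

from urllib.parse import parse_qs

def parse_qs_single(qs):
    # Assumed helper: parse_qs maps each key to a list of values; keys occur
    # at most once in these attributes, so flatten the lists.
    return {k: vs[0] for k, vs in parse_qs(qs).items()}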
Example #11
def finnpos_rm_pos(inf, outf, level):
    """
    Heuristic POS removal: Remove specific POSs altogether. Most commonly
    PRONOUN, since this POS never exists in WordNet.
    """
    def m(feat, val):
        # Build a predicate matching analyses where feature `feat` has the
        # value `val`.
        def inner(feats):
            return feat in feats and feats[feat] == val

        return inner

    to_remove = [m("pos", "PRONOUN")]
    if level in ("normal", "agg"):
        to_remove.extend((
            m("pos", "NUMERAL"),
            m("pos", "INTERJECTION"),
            m("pos", "CONJUNCTION"),
            m("pos", "PARTICLE"),
            m("pos", "PUNCTUATION"),
            m("proper", "PROPER"),
        ))
    if level == "agg":
        to_remove.append(m("pos", "ADPOSITION"))

    def sent_rm_pos(sent):
        finnpos_analys = get_finnpos_analys(sent)
        anns = sent.xpath("./annotations/annotation")
        new_anns = anns.copy()
        for ann in anns:
            tok, tok_len = get_ann_pos(ann)
            if tok_len != 1:
                continue
            props = finnpos_analys[tok][1]
            if any((match(props) for match in to_remove)):
                new_anns.remove(ann)
        trim_anns(anns, new_anns)

    transform_sentences(inf, sent_rm_pos, outf)
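
With m as defined inside finnpos_rm_pos above, the helper is just a predicate factory over a FinnPOS feature dict:

is_pronoun = m("pos", "PRONOUN")
is_pronoun({"pos": "PRONOUN", "case": "NOM"})  # True
is_pronoun({"pos": "NOUN"})                    # False
is_pronoun({})                                 # False: feature absent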
Example #12
def tok_span_dom(inf, outf, sup_only=False):
    """
    Dominance filter:

    When one annotation's multi-token anchor spans (contains) another, keep the
    one with the longest token span. When there is a partial overlap (none
    dominates), proceed greedily.
    """
    def sent_span_dom(sent):
        anns = sent.xpath("./annotations/annotation")
        token_positions = {}
        for ann in anns:
            if sup_only and not HasSupportTournament.rank(ann):
                continue
            tok, tok_len = get_ann_pos(ann)
            token_positions.setdefault(tok, []).append((tok_len, ann))
        new_anns = greedy_max_span(token_positions)
        if sup_only:
            # Annotations without support never entered the dominance
            # competition, so add them all back.
            for ann in anns:
                if not HasSupportTournament.rank(ann):
                    new_anns.append(ann)
        trim_anns(anns, new_anns)

    transform_sentences(inf, sent_span_dom, outf)
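
greedy_max_span itself is not shown on this page either. A sketch consistent with the docstring, assuming this example's encoding (token start mapped to (length, annotation) pairs):

def greedy_max_span(token_positions):
    # Assumed behaviour: scan starts left to right; at each start not yet
    # covered, keep the longest candidate and skip every start it covers.
    # Partial overlaps thus resolve greedily in favour of the leftmost span.
    kept = []
    next_free = 0
    for start in sorted(token_positions):
        if start < next_free:
            continue
        length, ann = max(token_positions[start], key=lambda pair: pair[0])
        kept.append(ann)
        next_free = start + length
    return kept

Note that char_span_dom (Example #2) feeds this the number of covered start positions rather than a character length, so the skipping arithmetic there differs in the real implementation.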
Example #13
def proc_stream(self, inf, outf):
    # Method excerpt: stream every sentence through this instance's
    # proc_sent processor.
    return transform_sentences(inf, self.proc_sent, outf)
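
For context, a class exposing proc_stream this way only needs a proc_sent method obeying the usual processor contract; a hypothetical example:

class UpperCaser:
    def proc_sent(self, sent):
        # May mutate the sentence in place, return BYPASS/BREAK, or return
        # None to keep it as-is.
        for text in sent.xpath("./text"):
            text.text = (text.text or "").upper()

    def proc_stream(self, inf, outf):
        return transform_sentences(inf, self.proc_sent, outf)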