Ejemplo n.º 1
0
def vec_clust_autoextend_graph(lemma_name, pos, return_centers=False):
    """Graph-cluster the senses of (lemma_name, pos) by cosine affinity.

    For each synset of the lemma, looks up the FinnWordNet ("qf2") lemma
    vector; senses with no qf2 lemma or no vector contribute a None row.
    Falls back to the unclusterable default when there are no senses.
    """
    fiwn_space = vecs.get_vecs()
    sense_labels = []
    vec_rows = []
    for synset_id, lemma_objs in get_lemma_objs(lemma_name, WORDNETS,
                                                pos).items():
        sense_labels.append(pre_id_to_post(synset_id))
        # Keep the last qf2 (FinnWordNet) lemma seen for this synset, if any.
        qf2_lemma = None
        for wn_name, lemma in lemma_objs:
            if wn_name == "qf2":
                qf2_lemma = lemma
        if qf2_lemma is None:
            vec_rows.append(None)
            continue
        # A missing vector is recorded as a None row as well.
        try:
            vec_rows.append(fiwn_space[get_lemma_id(qf2_lemma)])
        except KeyError:
            vec_rows.append(None)
    if not sense_labels:
        return unclusterable_default(sense_labels, return_centers=return_centers)
    affinities = cos_affinities_none(vec_rows)
    return graph_clust_grouped(affinities, sense_labels, return_centers)
Ejemplo n.º 2
0
def get_wordnet_defns(
    lemma_name,
    pos,
    skip_empty=True,
    tokenize=True,
    include_enss=False,
    lower=False,
):
    """Yield (post-form synset id, definition) pairs for a lemma.

    The definition comes from the first lemma object's synset. With
    ``tokenize`` it is a token list; ``include_enss`` additionally appends
    the underscore-split English lemma names of the synset, and ``lower``
    lowercases each token (both require ``tokenize``). With ``skip_empty``,
    senses with an empty definition are reported on stderr and skipped.
    """
    for synset_id, lemma_objs in get_lemma_objs(lemma_name, WORDNETS,
                                                pos).items():
        assert len(lemma_objs) >= 1
        # All lemma objects share the synset, so the first one suffices.
        tokens = lemma_objs[0][1].synset().definition().strip()
        if skip_empty and not tokens:
            # Bug fix: removed a stray trailing apostrophe from the message.
            sys.stderr.write(f"Empty defn: {lemma_name}.{pos}: {synset_id}\n")
            continue
        if tokenize:
            tokens = word_tokenize(tokens)
        if include_enss:
            assert tokenize
            # Append the English synset's lemma names, split on underscores.
            ss = en_synset(lemma_objs)
            for lemma in ss.lemmas():
                for bit in lemma.name().split("_"):
                    tokens.append(bit)
        if lower:
            assert tokenize
            tokens = [token.lower() for token in tokens]
        yield pre_id_to_post(synset_id), tokens
Ejemplo n.º 3
0
def gen(words, out_dir):
    """
    Generate unclustered words in OUT_DIR from word list WORDS

    Each entry of WORDS is expected to look like "word.Noun[#...]"; only
    nouns are supported (asserted below). For every word, one file named
    after "word.Noun" is written to OUT_DIR containing Wiktionary senses
    (grouped by etymology, blank-line separated) followed by WordNet
    synsets, one sense per line as "<id> # <comment>".
    """
    session = get_session()
    for word in words:
        # Strip any trailing "#..." annotation from the input line.
        word_pos = word.split("#")[0].strip()
        word, pos = word_pos.split(".")
        assert pos == "Noun"
        with open(pjoin(out_dir, word_pos), "w") as outf:
            # Get Wiktionary results
            # Rows come back ordered by etymology_index so that senses of
            # the same etymology are emitted as one contiguous group.
            results = session.execute(select([
                word_sense.c.sense_id,
                word_sense.c.etymology_index,
                word_sense.c.sense,
                word_sense.c.extra,
            ]).select_from(joined).where(
                (headword.c.name == word) &
                (word_sense.c.pos == "Noun")
            ).order_by(word_sense.c.etymology_index)).fetchall()
            prev_ety = None
            for row in results:
                # Blank line between etymology groups.
                if prev_ety is not None and row["etymology_index"] != prev_ety:
                    outf.write("\n")
                # Flatten the raw definition onto one line for the output.
                outf.write("{} # {}\n".format(row["sense_id"], row["extra"]["raw_defn"].strip().replace("\n", " --- ")))
                prev_ety = row["etymology_index"]

            # Get WordNet results
            for synset_id, lemma_objs in get_lemma_objs(word, WORDNETS, "n").items():
                # Which wordnets this synset was found in, e.g. {"fin", "qf2"}.
                wordnets = {wn for wn, _ in lemma_objs}
                outf.write("\n")
                outf.write("{} # [{}] {}\n".format(pre_id_to_post(synset_id), ", ".join(wordnets), annotation_comment(lemma_objs)))
Ejemplo n.º 4
0
        def ann2ss(ann):
            """Resolve an annotation to an NLTK WordNet synset.

            Returns None for new FinnWordNet synsets (post-form ids
            starting with "9"), which have no NLTK counterpart.
            """
            from finntk.wordnet.utils import pre_id_to_post
            from nltk.corpus import wordnet
            from stiff.munge.utils import synset_id_of_ann

            post_id = pre_id_to_post(synset_id_of_ann(ann))
            # TODO: proper handling of new FinnWordNet synsets
            if post_id[0] == "9":
                return None
            return wordnet.of2ss(post_id)
Ejemplo n.º 5
0
def write_lemma(keyout, inst_id, lemma):
    """Write one answer-key line "<inst_id> <guess>" to keyout.

    The guess is the English synset id mapped from the chosen lemma's
    FinnWordNet synset, or "U" (unknown) when lemma is None or no
    fi->en mapping exists (the latter case is reported on stderr).
    """
    fi2en, en2fi = get_en_fi_maps()
    guess = "U"
    if lemma is not None:
        fi_id = ss2pre(lemma.synset())
        if fi_id in fi2en:
            guess = pre_id_to_post(fi2en[fi_id])
        else:
            sys.stderr.write("No fi2en mapping found for {} ({})\n".format(
                fi_id, lemma))
    keyout.write("{} {}\n".format(inst_id, guess))
Ejemplo n.º 6
0
 def l2ss(ann):
     """Rewrite ann.text in place as the post-form id of its synset."""
     post_id = pre_id_to_post(synset_id_of_ann(ann))
     ann.text = post_id