def vec_clust_autoextend_graph(lemma_name, pos, return_centers=False):
    # Helpers such as vecs, get_lemma_objs, WORDNETS, pre_id_to_post,
    # get_lemma_id, unclusterable_default, cos_affinities_none and
    # graph_clust_grouped come from the surrounding module context.
    fiwn_space = vecs.get_vecs()
    labels = []
    mat = []
    for synset_id, lemma_objs in get_lemma_objs(lemma_name, WORDNETS, pos).items():
        labels.append(pre_id_to_post(synset_id))
        # Prefer the lemma from the "qf2" wordnet when one is available
        lemma_obj = None
        for wn, lemma in lemma_objs:
            if wn == "qf2":
                lemma_obj = lemma
        if lemma_obj is None:
            mat.append(None)
            continue
        lemma_id = get_lemma_id(lemma_obj)
        try:
            vec = fiwn_space[lemma_id]
        except KeyError:
            # No vector for this lemma; keep a placeholder so rows stay
            # aligned with labels
            mat.append(None)
            continue
        mat.append(vec)
    if not labels:
        return unclusterable_default(labels, return_centers=return_centers)
    affinities = cos_affinities_none(mat)
    return graph_clust_grouped(affinities, labels, return_centers)
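# Usage sketch (illustrative, not from the original source): assuming this
# module's helpers and the FinnWordNet vectors are available, clustering the
# senses of a lemma might look like this; "kieli" is a hypothetical example
# input.
def example_cluster_senses():
    clusters = vec_clust_autoextend_graph("kieli", "n")
    clusters_with_centers = vec_clust_autoextend_graph(
        "kieli", "n", return_centers=True
    )
    return clusters, clusters_with_centers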
def get_wordnet_defns(
    lemma_name,
    pos,
    skip_empty=True,
    tokenize=True,
    include_enss=False,
    lower=False,
):
    # Yield (synset_id, definition) pairs for each synset of lemma_name;
    # definitions are token lists when tokenize=True, raw strings otherwise.
    for synset_id, lemma_objs in get_lemma_objs(lemma_name, WORDNETS, pos).items():
        assert len(lemma_objs) >= 1
        tokens = lemma_objs[0][1].synset().definition().strip()
        if skip_empty and not tokens:
            sys.stderr.write(f"Empty defn: {lemma_name}.{pos}: {synset_id}\n")
            continue
        if tokenize:
            tokens = word_tokenize(tokens)
        if include_enss:
            # Extending with English synset lemmas only makes sense on tokens
            assert tokenize
            ss = en_synset(lemma_objs)
            for lemma in ss.lemmas():
                for bit in lemma.name().split("_"):
                    tokens.append(bit)
        if lower:
            assert tokenize
            tokens = [token.lower() for token in tokens]
        yield pre_id_to_post(synset_id), tokens
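# Usage sketch (illustrative, not from the original source): iterating the
# generator; assumes NLTK tokenizer data is installed for word_tokenize and
# uses the hypothetical lemma "kieli".
def example_print_defns():
    for synset_id, tokens in get_wordnet_defns("kieli", "n", lower=True):
        print(synset_id, " ".join(tokens))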
def gen(words, out_dir):
    """
    Generate unclustered words in OUT_DIR from word list WORDS
    """
    session = get_session()
    for word in words:
        # Entries look like "<lemma>.<pos>", optionally followed by a
        # "# comment" suffix
        word_pos = word.split("#")[0].strip()
        word, pos = word_pos.split(".")
        assert pos == "Noun"
        with open(pjoin(out_dir, word_pos), "w") as outf:
            # Get Wiktionary results
            results = session.execute(
                select([
                    word_sense.c.sense_id,
                    word_sense.c.etymology_index,
                    word_sense.c.sense,
                    word_sense.c.extra,
                ]).select_from(joined).where(
                    (headword.c.name == word) & (word_sense.c.pos == "Noun")
                ).order_by(word_sense.c.etymology_index)
            ).fetchall()
            prev_ety = None
            for row in results:
                # Blank line between etymology groups
                if prev_ety is not None and row["etymology_index"] != prev_ety:
                    outf.write("\n")
                outf.write("{} # {}\n".format(
                    row["sense_id"],
                    row["extra"]["raw_defn"].strip().replace("\n", " --- "),
                ))
                prev_ety = row["etymology_index"]
            # Get WordNet results
            for synset_id, lemma_objs in get_lemma_objs(word, WORDNETS, "n").items():
                wordnets = {wn for wn, _ in lemma_objs}
                outf.write("\n")
                outf.write("{} # [{}] {}\n".format(
                    pre_id_to_post(synset_id),
                    ", ".join(wordnets),
                    annotation_comment(lemma_objs),
                ))
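# Usage sketch (illustrative, not from the original source): entries in the
# word list follow the "<lemma>.Noun" form parsed above, optionally with a
# trailing "# comment"; the lemma and output directory are hypothetical.
def example_gen():
    gen(["kieli.Noun # example entry"], "/tmp/unclustered")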
def ann2ss(ann):
    from stiff.munge.utils import synset_id_of_ann
    from nltk.corpus import wordnet
    from finntk.wordnet.utils import pre_id_to_post

    synset_id = pre_id_to_post(synset_id_of_ann(ann))
    # TODO: proper handling of new FinnWordNet synsets
    if synset_id.startswith("9"):
        return None
    return wordnet.of2ss(synset_id)
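# Usage sketch (illustrative, not from the original source): mapping a STIFF
# annotation to an NLTK synset; ids starting with "9" (new FinnWordNet
# synsets) come back as None until the TODO above is resolved.
def example_ann_to_synset(ann):
    ss = ann2ss(ann)
    return ss.name() if ss is not None else None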
def write_lemma(keyout, inst_id, lemma):
    # Write one answer line in the form "<instance id> <synset id>",
    # falling back to "U" (unknown) when there is no lemma or no mapping.
    fi2en, en2fi = get_en_fi_maps()
    if lemma is None:
        guess = "U"
    else:
        chosen_synset_fi_id = ss2pre(lemma.synset())
        if chosen_synset_fi_id not in fi2en:
            sys.stderr.write(
                "No fi2en mapping found for {} ({})\n".format(
                    chosen_synset_fi_id, lemma
                )
            )
            guess = "U"
        else:
            guess = pre_id_to_post(fi2en[chosen_synset_fi_id])
    keyout.write("{} {}\n".format(inst_id, guess))
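# Usage sketch (illustrative, not from the original source): the instance id
# is hypothetical; passing lemma=None exercises the "U" (unknown) fallback.
def example_write_unknown(keyout):
    write_lemma(keyout, "example.n.1", None)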
def l2ss(ann):
    # Replace the annotation's text with its post-form synset id, in place
    ann.text = pre_id_to_post(synset_id_of_ann(ann))
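# Usage sketch (illustrative, not from the original source): rewriting the
# text of every annotation in a hypothetical list in place.
def example_relabel_anns(anns):
    for ann in anns:
        l2ss(ann)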