from collections.abc import Collection

import wn
import wn.similarity

# `Distance`, `lemmatize`, `pos_tags`, `simplify_tag`, and `edit_distance`
# are project-local helpers assumed to be defined elsewhere.

def token_distance(token: str, other: str,
                   metrics: Collection[Distance] = frozenset({Distance.NAIVE})) -> float:
    distance = 0.0
    token_lemma = lemmatize(token)
    other_lemma = lemmatize(other)
    if Distance.POS in metrics:
        token_pos = pos_tags([token])[0][1]
        other_pos = pos_tags([other])[0][1]
        distance += int(simplify_tag(token_pos) != simplify_tag(other_pos))
    if Distance.NAIVE in metrics:
        distance += int(token_lemma != other_lemma)
    if Distance.LENGTH in metrics:
        distance += abs(len(token_lemma) - len(other_lemma))
    if Distance.LEVENSHTEIN in metrics:
        distance += edit_distance(token_lemma, other_lemma)
    wordnet_metrics = {Distance.PATH, Distance.WU_PALMER, Distance.LEACOCK_CHORDOROW}
    if any(d in metrics for d in wordnet_metrics):
        try:
            synset1 = wn.synsets(token_lemma)[0]
            synset2 = wn.synsets(other_lemma)[0]
        except IndexError:
            # no synset for one of the lemmas: count the maximum distance (1)
            # once per requested WordNet-based metric
            distance += sum(d in metrics for d in wordnet_metrics)
            return distance / len(metrics)
        if Distance.PATH in metrics:
            distance += 1 - wn.similarity.path(synset1, synset2)
        if Distance.WU_PALMER in metrics:
            distance += 1 - wn.similarity.wup(synset1, synset2)
        if Distance.LEACOCK_CHORDOROW in metrics:
            distance += 1 - wn.similarity.lch(synset1, synset2)
    return distance / len(metrics)
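# A minimal usage sketch for token_distance; the metric combination is
# illustrative, and exact values depend on the project's lemmatizer:
print(token_distance('cats', 'dogs'))  # NAIVE only: 1.0 if the lemmas differ
print(token_distance(
    'cats', 'dogs',
    metrics={Distance.NAIVE, Distance.LENGTH, Distance.LEVENSHTEIN},
))  # the three component distances averaged over len(metrics)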
def seed_docs(lang, doc, wn):
    """Print the fields to be added, and seed them if possible."""
    for relname in gwadoc.RELATIONS:
        relili = gwadoc.relations[relname].proj.ili or ''
        forms = []
        defs = []
        if relili and wn.synsets(ili=relili):  # normally only one
            for s in wn.synsets(ili=relili):
                if s.definition():
                    defs.append(s.definition())
                # better to order by frequency if known (rarely known)
                for w in s.words():
                    for f in w.forms():
                        forms.append(f)
        print(f"\n\n### {relname} {relili}\n", file=doc)
        ### Name
        print(f'# relations.{relname}.name.{lang} = "{", ".join(forms)}"', file=doc)
        ### Short definition
        print(f'# relations.{relname}.df.{lang} = "{"; ".join(defs)}"', file=doc)
        ### Short example
        print(f'# relations.{relname}.ex.{lang} = ""', file=doc)
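# A minimal usage sketch (the output file name is hypothetical). The wn
# module itself can serve as the wordnet argument, since seed_docs only
# needs synsets(ili=...), assuming a lexicon has already been downloaded:
import wn
with open('seed-en.md', 'w') as doc:
    seed_docs('en', doc, wn)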
def test_hypernym_paths():
    information = wn.synsets('information')[0]
    example = wn.synsets('example')[0]
    sample = wn.synsets('sample')[0]
    random_sample = wn.synsets('random sample')[0]
    assert information.hypernym_paths() == []
    assert example.hypernym_paths() == [[information]]
    assert sample.hypernym_paths() == [[example, information]]
    assert random_sample.hypernym_paths() == [[sample, example, information]]
def test_interlingual_hypernym_paths():
    información = wn.synsets('información')[0]
    ejemplo = wn.synsets('ejemplo')[0]
    inferred = wn.Synset.empty('*INFERRED*')
    muestra_aleatoria = wn.synsets('muestra aleatoria')[0]
    assert información.hypernym_paths() == []
    assert ejemplo.hypernym_paths() == [[información]]
    assert muestra_aleatoria.hypernym_paths() == [[
        inferred, ejemplo, información
    ]]
def gloss(self, word):
    # do not volunteer the gloss (definition) for words not in the vocabulary
    if word not in self.items:
        return None
    synsets = wn.synsets(word)
    # distinguish None (word not in the vocabulary) from vocabulary words
    # that have no gloss in WordNet
    return synsets[0].definition() if synsets else 'NO DEFINITION'
def test_path():
    information = wn.synsets('information')[0]
    example = wn.synsets('example')[0]
    sample = wn.synsets('sample')[0]
    random_sample = wn.synsets('random sample')[0]
    datum = wn.synsets('datum')[0]
    exemplify = wn.synsets('exemplify')[0]
    assert sim.path(information, information) == 1/1
    assert sim.path(information, example) == 1/2
    assert sim.path(information, sample) == 1/3
    assert sim.path(information, random_sample) == 1/4
    assert sim.path(random_sample, datum) == 1/5
    # example (noun) and exemplify (verb) are only connected through a
    # simulated root (see test_shortest_path below)
    assert sim.path(example, exemplify, simulate_root=True) == 1/4
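# For reference: path similarity is 1 / (d + 1), where d is the number of
# edges on the shortest path between the two synsets, which is why the
# expected values above are unit fractions. A sketch (not the library's
# implementation) in terms of Synset.shortest_path(), whose result lists
# every synset after the start and so has length d:
def path_similarity(s1, s2, simulate_root=False):
    return 1 / (len(s1.shortest_path(s2, simulate_root=simulate_root)) + 1)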
def test_wup():
    information = wn.synsets('information')[0]
    example = wn.synsets('example')[0]
    sample = wn.synsets('sample')[0]
    random_sample = wn.synsets('random sample')[0]
    datum = wn.synsets('datum')[0]
    exemplify = wn.synsets('exemplify')[0]
    assert sim.wup(information, information) == (2*1) / (0+0+2*1)
    assert sim.wup(information, example) == (2*1) / (0+1+2*1)
    assert sim.wup(information, sample) == (2*1) / (0+2+2*1)
    assert sim.wup(information, random_sample) == (2*1) / (0+3+2*1)
    assert sim.wup(random_sample, datum) == (2*1) / (3+1+2*1)
    # the noun/verb pair again needs a simulated root as the common subsumer
    assert sim.wup(example, exemplify, simulate_root=True) == (2*1) / (2+1+2*1)
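# The expected values above follow the Wu-Palmer pattern 2i / (j + k + 2i),
# where i is the depth of the least common subsumer (the root counts as
# depth 1 in this test hierarchy) and j, k are the edge counts from each
# synset down to that subsumer. A sketch of just the arithmetic:
def wup_from_counts(i, j, k):
    return (2 * i) / (j + k + 2 * i)

# e.g. information vs. sample: the subsumer is information itself,
# so i=1, j=0, k=2
assert wup_from_counts(1, 0, 2) == (2*1) / (0+2+2*1)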
def test_shortest_path():
    information = wn.synsets('information')[0]
    example = wn.synsets('example')[0]
    sample = wn.synsets('sample')[0]
    random_sample = wn.synsets('random sample')[0]
    datum = wn.synsets('datum')[0]
    exemplify = wn.synsets('exemplify')[0]
    inferred_root = wn.Synset.empty('*INFERRED*')
    assert information.shortest_path(information) == []
    assert information.shortest_path(datum) == [datum]
    assert information.shortest_path(sample) == [example, sample]
    assert sample.shortest_path(information) == [example, information]
    assert random_sample.shortest_path(datum) == [
        sample, example, information, datum
    ]
    with pytest.raises(wn.Error):
        example.shortest_path(exemplify)
    assert example.shortest_path(exemplify, simulate_root=True) == [
        information, inferred_root, exemplify
    ]
def pos(self, word):
    # do not volunteer the pos for words not in the vocabulary
    if word not in self.items:
        return None
    synsets = wn.synsets(word)
    # default to 'n' (noun) for vocabulary words not found in WordNet
    return synsets[0].pos() if synsets else 'n'
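# A usage sketch for gloss() and pos(), assuming a hypothetical Vocabulary
# class that defines both methods over an `items` collection:
vocab = Vocabulary(items={'information', 'example'})
assert vocab.gloss('datum') is None  # not in the vocabulary
print(vocab.gloss('information'))    # WordNet definition, or 'NO DEFINITION'
print(vocab.pos('information'))      # first synset's POS, or 'n' as fallback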
def test_synsets_mini():
    assert len(wn.synsets()) == 12
    assert all(isinstance(ss, wn.Synset) for ss in wn.synsets())

    synsets = wn.synsets('information')  # search lemma
    assert len(synsets) == 1
    assert 'information' in synsets[0].lemmas()

    synsets = wn.synsets('exemplifies')  # search secondary form
    assert len(synsets) == 1
    assert 'exemplify' in synsets[0].lemmas()

    assert len(wn.synsets(pos='n')) == 9
    assert len(wn.synsets(pos='v')) == 3
    assert len(wn.synsets(pos='q')) == 0  # fake pos

    assert len(wn.synsets(ili='i67469')) == 2
    assert len(wn.synsets(ili='i67468')) == 0

    assert len(wn.synsets(lang='en')) == 8
    assert len(wn.synsets(lang='es')) == 4
    assert len(wn.synsets(lexicon='test-en')) == 8
    assert len(wn.synsets(lexicon='test-es')) == 4
    assert len(wn.synsets(lang='en', lexicon='test-en')) == 8

    assert len(wn.synsets(pos='v', lang='en')) == 2
    assert len(wn.synsets('information', lang='en')) == 1
    assert len(wn.synsets('information', lang='es')) == 0
    assert len(wn.synsets(ili='i67469', lang='es')) == 1

    with pytest.raises(wn.Error):
        wn.synsets(lang='unk')
    with pytest.raises(wn.Error):
        wn.synsets(lexicon='test-unk')
def test_synsets_empty():
    assert len(wn.synsets()) == 0
    ]  # lemmatization in English
    print("Lemmatized sentence (English):", lemmatized_sentence_en)
    # I could not get this to work for Romanian as well
    stemmer_en = snowballstemmer.stemmer('english')
    stemmer_sentence_en = stemmer_en.stemWords(lemmatized_sentence_en)
    print("Sentence after the English stemmer:", stemmer_sentence_en)
else:
    # experiments:
    # possible alternative for Romanian lemmatization:
    # https://github.com/dumitrescustefan/RoWordNet
    wn = rwn.RoWordNet()  # note: shadows the wn module in this branch
    cuvant_initial = 'carte'
    # stemmer_ro = snowballstemmer.stemmer('romanian')
    # stemmer_sentence_ro = stemmer_ro.stemWords([cuvant_initial])
    # print(stemmer_sentence_ro)
    # synset_ids = wn.synsets(literal=stemmer_sentence_ro[0])
    synset_ids = wn.synsets(literal=cuvant_initial)
    if len(synset_ids) >= 1:
        for synset_id in synset_ids:
            print("Possible lemmatization for", cuvant_initial,
                  ": literals=", wn(synset_id).literals,
                  " type=", wn(synset_id).pos)
    else:
        print("No lemmatization in this module:", cuvant_initial)

# wn.download('ronwn')
# w = wn.words('arbusti')[0]
# print(w.lemma())
# nltk.download()
# print("NLTK wordnet languages:", sorted(wn_nltk_test.langs()))
core = []
for line in open('wn-core-ili.tab'):
    core.append(line.strip())
# print(core)


def link(text, url):
    return f"<a href='{url}'>{text}</a>"


# `licenses` (defined elsewhere) maps license URLs to short display names
stats = list()
for lex in wn.lexicons():
    # FIXME: link for the wordnet license
    incore = len([s for s in wn.synsets(lexicon=lex.id)
                  if s.ili and s.ili.id in core])
    synsets = len(wn.synsets(lexicon=lex.id))
    data = f"""
    <tr>
      <th>{lex.specifier()}</th>
      <td>{lex.language}</td>
      <td>{link(lex.label, lex.url)}</td>
      <td align='right'>{synsets:,d}</td>
      <td align='right'>{len(wn.senses(lexicon=lex.id)):,d}</td>
      <td align='right'>{len(wn.words(lexicon=lex.id)):,d}</td>
      <td align='right'>{incore/len(core):.1%}</td>
      <td>{link(licenses[lex.license], lex.license)}</td>
    </tr>"""
    stats.append(data)

headers = "ID:ver Lang Label Synsets Senses Words Core License".split()
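# A sketch of assembling the final table from `headers` and `stats`, in
# the same inline-HTML style as the rows above (layout is illustrative):
header_row = '<tr>' + ''.join(f'<th>{h}</th>' for h in headers) + '</tr>'
table = '<table>\n' + header_row + ''.join(stats) + '\n</table>'
print(table)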
def test_max_depth():
    assert wn.synsets('information')[0].max_depth() == 0
    assert wn.synsets('example')[0].max_depth() == 1
    assert wn.synsets('sample')[0].max_depth() == 2
    assert wn.synsets('random sample')[0].max_depth() == 3
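# max_depth() agrees with the longest hypernym path in test_hypernym_paths
# above; a quick cross-check on the same data:
ss = wn.synsets('random sample')[0]
assert ss.max_depth() == max(
    (len(path) for path in ss.hypernym_paths()), default=0
)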