Example #1
def token_distance(token: str, other: str, metrics: Collection[Distance] = {Distance.NAIVE}) -> float:
    distance = 0.0
    token_lemma = lemmatize(token)
    other_lemma = lemmatize(other)

    if Distance.POS in metrics:
        token_pos = pos_tags([token])[0][1]
        other_pos = pos_tags([other])[0][1]
        distance += int(simplify_tag(token_pos) != simplify_tag(other_pos))
    if Distance.NAIVE in metrics:
        distance += int(token_lemma != other_lemma)
    if Distance.LENGTH in metrics:
        distance += abs(len(token_lemma) - len(other_lemma))
    if Distance.LEVENSHTEIN in metrics:
        distance += edit_distance(token_lemma, other_lemma)
    if any(d in metrics for d in {Distance.PATH, Distance.WU_PALMER, Distance.LEACOCK_CHORDOROW}):
        try:
            synset1, synset2 = wn.synsets(token_lemma)[0], wn.synsets(other_lemma)[0]
        except IndexError:
            # count each requested synset-based metric as a maximal miss
            distance += sum(d in metrics for d in {Distance.PATH, Distance.WU_PALMER, Distance.LEACOCK_CHORDOROW})
            return distance / len(metrics)
        if Distance.PATH in metrics:
            distance += 1 - wn.similarity.path(synset1, synset2)
        if Distance.WU_PALMER in metrics:
            distance += 1 - wn.similarity.wup(synset1, synset2)
        if Distance.LEACOCK_CHORDOROW in metrics:
            distance += 1 - wn.similarity.lch(synset1, synset2)

    return distance / len(metrics)
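
A minimal usage sketch (hypothetical inputs; it assumes the Distance enum and the lemmatize/pos_tags helpers used above are importable):

print(token_distance('running', 'ran'))  # NAIVE only: 0.0 if both lemmatize to 'run'
print(token_distance('cat', 'dog', metrics={Distance.NAIVE, Distance.LEVENSHTEIN}))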
Example #2
def seed_docs(lang, doc, wn):
    """Print the fields to be added, and seed them if possible"""
    for relname in gwadoc.RELATIONS:
        relili = gwadoc.relations[relname].proj.ili or ''
        forms = []
        defs = []
        if relili and wn.synsets(ili=relili):
            ## normally only one
            for s in wn.synsets(ili=relili):
                if s.definition():
                    defs.append(s.definition())
                ## better to order by frequency if known (rarely known)
                for w in s.words():
                    for f in w.forms():
                        forms.append(f)
        
        print (f"""\n\n### {relname} {relili}\n""", file=doc)
        
        ### Name
        print (f'# relations.{relname}.name.{lang} = "{", ".join(forms)}"',
               file=doc)
            
        ### Short definition
        print (f'# relations.{relname}.df.{lang} = "{"; ".join(defs)}"',
               file=doc)

        ### Short example
        print (f'# relations.{relname}.ex.{lang} = ""',
               file=doc)
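
A hedged usage sketch (the lexicon specifier and output path are hypothetical; seed_docs only needs a language code, a writable file handle, and a wn.Wordnet object):

import wn
w = wn.Wordnet('oewn:2021')  # hypothetical lexicon specifier
with open('relations-en.md', 'w') as doc:
    seed_docs('en', doc, w)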
Example #3
def test_hypernym_paths():
    information = wn.synsets('information')[0]
    example = wn.synsets('example')[0]
    sample = wn.synsets('sample')[0]
    random_sample = wn.synsets('random sample')[0]
    assert information.hypernym_paths() == []
    assert example.hypernym_paths() == [[information]]
    assert sample.hypernym_paths() == [[example, information]]
    assert random_sample.hypernym_paths() == [[sample, example, information]]
Example #4
def test_interlingual_hypernym_paths():
    información = wn.synsets('información')[0]
    ejemplo = wn.synsets('ejemplo')[0]
    inferred = wn.Synset.empty('*INFERRED*')
    muestra_aleatoria = wn.synsets('muestra aleatoria')[0]
    assert información.hypernym_paths() == []
    assert ejemplo.hypernym_paths() == [[información]]
    assert muestra_aleatoria.hypernym_paths() == [[
        inferred, ejemplo, información
    ]]
Example #5
def gloss(self, word):
    # do not volunteer the gloss (definition) for words not in the vocabulary
    if word not in self.items:
        return None
    synsets = wn.synsets(word)
    # distinguish None (word not in the vocabulary) from in-vocabulary words
    # that simply have no gloss in WordNet
    return synsets[0].definition() if synsets else 'NO DEFINITION'
Example #6
def test_path():
    information = wn.synsets('information')[0]
    example = wn.synsets('example')[0]
    sample = wn.synsets('sample')[0]
    random_sample = wn.synsets('random sample')[0]
    datum = wn.synsets('datum')[0]
    exemplify = wn.synsets('exemplify')[0]
    assert sim.path(information, information) == 1/1
    assert sim.path(information, example) == 1/2
    assert sim.path(information, sample) == 1/3
    assert sim.path(information, random_sample) == 1/4
    assert sim.path(random_sample, datum) == 1/5
    assert sim.path(example, exemplify) == 1/4
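
The pattern behind these expected values: sim.path returns 1 / (d + 1), where d is the number of edges on the shortest path between the two synsets. A spot check reusing the fixture synsets above:

d = len(information.shortest_path(sample))  # 2 edges, via 'example'
assert sim.path(information, sample) == 1 / (d + 1)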
Example #7
def test_wup():
    information = wn.synsets('information')[0]
    example = wn.synsets('example')[0]
    sample = wn.synsets('sample')[0]
    random_sample = wn.synsets('random sample')[0]
    datum = wn.synsets('datum')[0]
    exemplify = wn.synsets('exemplify')[0]
    assert sim.wup(information, information) == (2*1) / (0+0+2*1)
    assert sim.wup(information, example) == (2*1) / (0+1+2*1)
    assert sim.wup(information, sample) == (2*1) / (0+2+2*1)
    assert sim.wup(information, random_sample) == (2*1) / (0+3+2*1)
    assert sim.wup(random_sample, datum) == (2*1) / (3+1+2*1)
    assert sim.wup(example, exemplify) == (2*1) / (2+1+2*1)
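
Each expected value instantiates the Wu-Palmer formula wup = (2*i) / (n1 + n2 + 2*i), where i is the depth of the least common subsumer (1 for the top noun synset, since a root node is counted) and n1, n2 are the edge counts from each synset up to that subsumer. For instance, in the fourth assertion the subsumer of information and random_sample is information itself (i=1, n1=0, n2=3), giving 2/5.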
Example #8
def test_shortest_path():
    information = wn.synsets('information')[0]
    example = wn.synsets('example')[0]
    sample = wn.synsets('sample')[0]
    random_sample = wn.synsets('random sample')[0]
    datum = wn.synsets('datum')[0]
    exemplify = wn.synsets('exemplify')[0]
    inferred_root = wn.Synset.empty('*INFERRED*')
    assert information.shortest_path(information) == []
    assert information.shortest_path(datum) == [datum]
    assert information.shortest_path(sample) == [example, sample]
    assert sample.shortest_path(information) == [example, information]
    assert random_sample.shortest_path(datum) == [
        sample, example, information, datum
    ]
    with pytest.raises(wn.Error):
        example.shortest_path(exemplify)
    assert example.shortest_path(exemplify, simulate_root=True) == [
        information, inferred_root, exemplify
    ]
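
The final two assertions show why simulate_root exists: 'example' (a noun) and 'exemplify' (a verb) sit in disconnected hierarchies, so no path exists until an artificial shared root is simulated, and that root then appears in the returned path:

path = example.shortest_path(exemplify, simulate_root=True)
assert path[1] == wn.Synset.empty('*INFERRED*')  # the simulated root joins the two hierarchies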
Example #9
def pos(self, word):
    # do not volunteer the pos for words not in the vocabulary
    if word not in self.items:
        return None
    synsets = wn.synsets(word)
    return synsets[0].pos() if synsets else 'n'
Example #10
def test_synsets_mini():
    assert len(wn.synsets()) == 12
    assert all(isinstance(ss, wn.Synset) for ss in wn.synsets())

    synsets = wn.synsets('information')  # search lemma
    assert len(synsets) == 1
    assert 'information' in synsets[0].lemmas()

    synsets = wn.synsets('exemplifies')  # search secondary form
    assert len(synsets) == 1
    assert 'exemplify' in synsets[0].lemmas()

    assert len(wn.synsets(pos='n')) == 9
    assert len(wn.synsets(pos='v')) == 3
    assert len(wn.synsets(pos='q')) == 0  # fake pos

    assert len(wn.synsets(ili='i67469')) == 2
    assert len(wn.synsets(ili='i67468')) == 0

    assert len(wn.synsets(lang='en')) == 8
    assert len(wn.synsets(lang='es')) == 4

    assert len(wn.synsets(lexicon='test-en')) == 8
    assert len(wn.synsets(lexicon='test-es')) == 4

    assert len(wn.synsets(lang='en', lexicon='test-en')) == 8
    assert len(wn.synsets(pos='v', lang='en')) == 2
    assert len(wn.synsets('information', lang='en')) == 1
    assert len(wn.synsets('information', lang='es')) == 0
    assert len(wn.synsets(ili='i67469', lang='es')) == 1

    with pytest.raises(wn.Error):
        wn.synsets(lang='unk')
    with pytest.raises(wn.Error):
        wn.synsets(lexicon='test-unk')
Example #11
def test_synsets_empty():
    assert len(wn.synsets()) == 0
Example #12
        ]  # lemmatization in English
        print("Lemmatized sentence in English:", lemmatized_sentence_en)
        # could not get lemmatization to work for Romanian as well

        stemmer_en = snowballstemmer.stemmer('english')
        stemmer_sentence_en = stemmer_en.stemWords(lemmatized_sentence_en)
        print("Sentence after stemming in English:", stemmer_sentence_en)

else:  # tests:
    # possible alternative for Romanian lemmatization: https://github.com/dumitrescustefan/RoWordNet
    wn = rwn.RoWordNet()
    cuvant_initial = 'carte'
    #stemmer_ro = snowballstemmer.stemmer('romanian');
    # stemmer_sentence_ro = stemmer_ro.stemWords([cuvant_initial])
    # print(stemmer_sentence_ro)
    # synset_ids = wn.synsets(literal=stemmer_sentence_ro[0])
    synset_ids = wn.synsets(literal=cuvant_initial)
    if synset_ids:
        for synset_id in synset_ids:
            print("Possible lemmatization for", cuvant_initial, ": literals=",
                  wn(synset_id).literals, " type=", wn(synset_id).pos)
    else:
        print("No lemmatization in this module for:", cuvant_initial)

    #wn.download('ronwn')
    #w = wn.words('arbusti')[0]
    #print(w.lemma())
    #nltk.download()
    #print("NLTK wordnet languages:",  sorted(wn_nltk_test.langs()))
Example #13
core = []
for line in open('wn-core-ili.tab'):
    core.append(line.strip())
#print(core)


def link(text, url):
    return f"<a href='{url}'>{text}</a>"


stats = []
for lex in wn.lexicons():
    ### FIXME: link for the wordnet license
    incore = len(
        [s for s in wn.synsets(lexicon=lex.id) if s.ili and (s.ili.id in core)])
    synsets = len(wn.synsets(lexicon=lex.id))
    data = f"""  <tr>
    <th>{lex.specifier()}</th>
    <td>{lex.language}</td>
    <td>{link(lex.label, lex.url)}</td>
    <td align='right'>{synsets:,d}</td>
    <td align='right'>{len(wn.senses(lexicon=lex.id)):,d}</td>
    <td align='right'>{len(wn.words(lexicon=lex.id)):,d}</td>
    <td align='right'>{incore/len(core):.1%}</td>
    <td>{link(licenses[lex.license], lex.license)}</td>
    </tr>"""
    stats.append(data)

headers = "ID:ver Lang Label Synsets Senses Words Core License".split()
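
The snippet stops after defining the column headers; a plausible continuation that assembles the HTML table from headers and stats (a sketch, not part of the original file):

print('<table>')
print('  <tr>' + ''.join(f'<th>{h}</th>' for h in headers) + '</tr>')
print('\n'.join(stats))
print('</table>')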
Example #14
def test_max_depth():
    assert wn.synsets('information')[0].max_depth() == 0
    assert wn.synsets('example')[0].max_depth() == 1
    assert wn.synsets('sample')[0].max_depth() == 2
    assert wn.synsets('random sample')[0].max_depth() == 3