Example #1
def nounify(verb_word):
    set_of_related_nouns = set()

    l1 = wn.morphy(verb_word, wn.VERB)
    if (l1 is not None):
        print(l1)
        for lemma in wn.lemmas(l1, pos="v"):
            for related_form in lemma.derivationally_related_forms():
                for synset in wn.synsets(related_form.name(), pos=wn.NOUN):
                    if wn.synset('person.n.01') in synset.closure(
                            lambda s: s.hypernyms()):
                        set_of_related_nouns.add(synset)

    l1 = wn.morphy(verb_word, wn.ADJ)
    if (l1 is not None):
        print(l1)
        for lemma in wn.lemmas(l1, pos="a"):
            for related_form in lemma.derivationally_related_forms():
                for synset in wn.synsets(related_form.name(), pos=wn.NOUN):
                    if wn.synset('person.n.01') in synset.closure(
                            lambda s: s.hypernyms()):
                        set_of_related_nouns.add(synset)

        for lemma in wn.lemmas(l1, pos="s"):
            for related_form in lemma.derivationally_related_forms():
                for synset in wn.synsets(related_form.name(), pos=wn.NOUN):
                    if wn.synset('person.n.01') in synset.closure(
                            lambda s: s.hypernyms()):
                        set_of_related_nouns.add(synset)

    return set_of_related_nouns
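
A minimal usage sketch for nounify() above, assuming wn is nltk.corpus.wordnet and the WordNet corpus has been downloaded; 'teach' is only an illustrative input:

# Collect person-denoting noun synsets derived from the verb 'teach'
# (e.g. the 'teacher' senses that are hyponyms of person.n.01).
for synset in nounify('teach'):
    print(synset.name(), '-', synset.definition())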
Example #3
def wordNet_features(lidx_start, lidx_end, ridx_start, ridx_end, pos_dict):

    all_tokens = [tok.lower() for tok, span in pos_dict.values()]

    features = []
    try:
        sims = set(wn.synsets(all_tokens[lidx_start])).intersection(
            wn.synsets(all_tokens[ridx_start]))
        if len(sims) > 0:
            features.append(1)
        else:
            features.append(0)
    except:
        features.append(0)

    try:
        lderiv = set(
            itertools.chain.from_iterable([
                lemma.derivationally_related_forms()
                for lemma in wn.lemmas(all_tokens[lidx_start])
            ]))
        rderiv = set(
            itertools.chain.from_iterable([
                lemma.derivationally_related_forms()
                for lemma in wn.lemmas(all_tokens[ridx_start])
            ]))
        if len(lderiv.intersection(rderiv)) > 0:
            features.append(1)
        else:
            features.append(0)
    except:
        features.append(0)

    return features
Example #4
def isCompound(tok1, tok2):
    if tok1 is None or tok2 is None:
        return False
    l1 = tok1.stem
    l2 = tok2.stem
    ll = [l1,l2]
    return wn.lemmas(''.join(ll)) or wn.lemmas('_'.join(ll)) or wn.lemmas('-'.join(ll))
Example #5
def isCompound(tok1, tok2):
    if tok1 is None or tok2 is None:
        return False
    l1 = tok1.stem
    l2 = tok2.stem
    ll = [l1,l2]
    return wn.lemmas(''.join(ll)) or wn.lemmas('_'.join(ll)) or wn.lemmas('-'.join(ll))
def path_similarity(term1, term2):
    '''
    Evaluates the path similarity between two Italian words using WordNet.
    :param term1: The first word. Must be a dictionary with the following keys: "lemma": the lemma, "pos": the pos tag
    :param term2: The second word. Must be a dictionary with the following keys: "lemma": the lemma, "pos": the pos tag
    :return: the path similarity
    :rtype: float
    '''    
    if term1['pos'] in ['r', 'n', 'a', 'v']:
        lemma1 = wn.lemmas(term1['lemma'], lang='ita', pos=term1['pos'])
    else:
        lemma1 = wn.lemmas(term1['lemma'], lang='ita')

    if term2['pos'] in ['r', 'n', 'a', 'v']:
        lemma2 = wn.lemmas(term2['lemma'], lang='ita', pos=term2['pos'])
    else:
        lemma2 = wn.lemmas(term2['lemma'], lang='ita')

    if (len(lemma2) == 0 or len(lemma1) == 0) and term1['lemma'] == term2['lemma']:
        return 1
    if len(lemma1) == 0 or len(lemma2) == 0:
        return 0
    
    synset1 = lemma1[0].synset()
    synset2 = lemma2[0].synset()
	
    tmp = synset1.path_similarity(synset2)
    if tmp is None:
        return 0
    else:
        return tmp
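
A hedged usage sketch for path_similarity() above; it assumes wn is nltk.corpus.wordnet and that the Open Multilingual WordNet data (lang='ita') is installed. The two Italian terms are illustrative only:

term1 = {'lemma': 'cane', 'pos': 'n'}   # 'dog'
term2 = {'lemma': 'gatto', 'pos': 'n'}  # 'cat'
print(path_similarity(term1, term2))    # a float in (0, 1], or 0 if no path is found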
Example #7
def print_other_lexical_rel():
    good1 = wn.synset('good.a.01')
    wn.lemmas('good')
    print("Antonyms of 'good': " + str(good1.lemmas()[0].antonyms()))
    print("")
    print("Entailment of 'walk': " + str(wn.synset('walk.v.01').entailments()))
    print("")
Example #8
def is_eventive_adverb(token):
    """
    Returns True if the given token is an eventive adverb.  We use simple,
    conservative heuristics that treat adverbs derived from present or past
    participles of verbs as eventive, e.g., 'sparklingly' and 'unexpectedly'.
    """
    word = token.text.lower()

    # Remove a prefix, if any, to make verb conjugation and checking easier below.
    morphemes = get_morphemes(word)
    if morphemes and morphemes[0] in PREFIXES:
        word = word[len(morphemes[0]):]
    if len(word) == 0:
        return False

    # Pertainyms are relational adjectives
    lm_adjs = [
        lm_ptn for lm in wn.lemmas(word, pos=wn.ADV)
        for lm_ptn in lm.pertainyms()
    ]
    if word.endswith('ly'):
        lm_adjs.extend(wn.lemmas(word[:-2]))
    words = set([lm_adj.name() for lm_adj in lm_adjs])
    for word in words:
        conj_inf = conjugate(word, 'inf')
        if wn.lemmas(conj_inf, wn.VERB):
            if conjugate(conj_inf, 'ppart') == word:
                return True
            if conjugate(conj_inf, 'part') == word:
                return True

    return False
Example #9
def func1():
    while True:
        word = input('Enter a word: ')
        print('Synsets:')
        for i, w in enumerate(wn.synsets(word)):
            print("{0}) '{1}' -- definition: '{2}'".format(
                i, w.name(), w.definition()))

        print('Lemmas:')
        for i, w in enumerate(wn.lemmas(word)):
            print("{0}) '{1}'".format(i, str(w)))
        print('')
        #continue
        lem_id = int(input('Enter a lemma ID: '))
        l = wn.lemmas(word)[lem_id]
        print(l, l.synset().definition())  # Lemma has no definition(); use its synset's
        continue
        syn_id = int(input('Enter a synset ID: '))

        synset = wn.synsets(word)[syn_id]
        print("Lemmas in %s:" % str(synset))
        if len(wn.synsets(word)) > 0:
            for i, l in enumerate(synset.lemmas()):
                print("{0}) '{1}'".format(i, l))
        lem_id = int(input('Enter a lemma ID: '))
        lemma = synset.lemmas()[lem_id]
        print([x for x in dir(lemma) if not re.match('^_.+', x)])
        print("<%s> :" % str(lemma))
        print('synset - ', lemma.synset())
        print('hypernyms - ', lemma.hypernyms())
        print('hyponyms - ', lemma.hyponyms())
        print('pertainyms - ', lemma.pertainyms())
        print('antonyms - ', lemma.antonyms())
        print('usage_domains - ', lemma.usage_domains())
Example #10
def lemma_builder(words):
    """
    returns a list of lemmas from a list of words
    """
    lemmas = []
    for word in words:
        if wn.lemmas(word):
            lemmas.append(wn.lemmas(word)[0].name())  # .name is a method in NLTK 3, not an attribute
    return lemmas
def synonyms(wordlist):
    wlist=[]    
    for word in wordlist:              
        print word        
        if wordnet.lemmas(word):
            string=str(wordnet.lemmas(word)[0])                                        
            
            wlist+=re.findall("Lemma\(\'(.+?)\.\w",string)
            print re.findall("Lemma\(\'(.+?)\.\w",string)
        else:
            wlist.append(word)  # append the word itself; += would extend the list with its characters
    return wlist
Example #12
def get_reasonable_synsets(word):
    lemmas = wordnet.lemmas(word)
    cnet_lemma, _pos = LEMMATIZER.lookup('en', word)
    if cnet_lemma != word:
        lemmas += wordnet.lemmas(cnet_lemma)
    good_synsets = []
    for lem in lemmas:
        syn = lem.synset()
        if syn.lemma_names()[0] == word:
            good_synsets.append(syn)
    if not good_synsets:
        return [lem.synset() for lem in lemmas]
    else:
        return good_synsets
Example #13
def wordnet_vector(concept, conceptlist):
    from nltk.corpus import wordnet
    start_points = wordnet.lemmas(concept.replace(' ', '_'))
    if not start_points: return None
    results = divisi2.DenseVector(None, conceptlist)
    for concept2 in conceptlist:
        end_points = wordnet.lemmas(concept2.replace(' ', '_'))
        best_sim = 0.0
        for start_point in start_points:
            for end_point in end_points:
                sim = start_point.synset.wup_similarity(end_point.synset)
                if sim > best_sim: best_sim = sim
        results[results.index(concept2)] = best_sim
    return results
    def predict_nearest(self, context):

        lemma = context.lemma
        pos = context.pos

        similarity = {}

        LemmaList = wn.lemmas(lemma, pos=pos)

        for lemmaTemp in LemmaList:

            LemmaListNew = lemmaTemp.synset().lemmas()

            # print(LemmaListNew)

            for l in LemmaListNew:
                name = l.name()
                # print(name)
                if name != lemma:
                    if name not in self.model.vocab:
                        continue
                    #if "_" in name:
                    # name = name.replace("_", " ")
                    # continue
                    if name not in similarity:
                        similarity[name] = self.model.similarity(name, lemma)

        SimilarityList = sorted(similarity.items(),
                                key=lambda x: x[1],
                                reverse=True)

        return SimilarityList[0][0]
    def predict_nearest(self, context):
        stop_words = stopwords.words('english')
        target_lemma, target_pos = context.lemma, context.pos
        v1 = self.model.wv[target_lemma]
        possible_synonyms = {}
        for lexeme in wn.lemmas(target_lemma, pos=target_pos):
            synset = lexeme.synset()
            for w in synset.lemmas():
                synonym = w.name().replace('_', ' ')
                synonym_words = synonym.split()
                # if synonym consists of only one word and it is embedded in wv
                # calculate the cosine distance between the target and synonym
                # if synonym not in wv use 'UNK' embedding
                if len(synonym_words
                       ) == 1 and synonym_words[0] in self.model.wv:
                    v2 = self.model.wv[synonym_words[0]]
                    possible_synonyms[synonym] = self.cos(v1, v2)
                else:
                    # if the synonym compose of multiple words, e.g take up, in order to
                    # first get rid of the stop words, then find the first word in wv if available
                    # calculate the cosine distance between the first word and target
                    # use the first word as the prediction
                    synonym_words = [
                        w for w in synonym_words if w not in stop_words
                    ]
                    if synonym_words[0] in self.model.wv:
                        v2 = self.model.wv[synonym_words[0]]
                        possible_synonyms[synonym_words[0]] = self.cos(v1, v2)

        possible_synonyms.pop(target_lemma, None)
        predictor = [
            k for k, v in possible_synonyms.items()
            if v == max(possible_synonyms.values())
        ][0]
        return predictor  # replace for part 4
Example #16
def get_word_sense_vectors(candidate):
    vectors = {}
    try:
        candidate_vec = glove[candidate]
    except Exception:
        # print(candidate, "not found in glove")
        return None
    for sense in wn.lemmas(candidate):
        # if candidate == "bank":
        # print("synonym of ", candidate, " is ", ss.lemmas()[0].name())
        # print("key of ", candidate, " is ", ss.lemmas()[0].key())
        gloss = [sense.synset().definition()]
        gloss.extend(sense.synset().examples())
        word_vectors = []
        for sentence in gloss:
            tokens = nltk.word_tokenize(sentence)
            pos_tags = nltk.pos_tag(tokens)
            for gloss_pos, tag in pos_tags:
                if get_valid_pos_tag(tag):
                    try:
                        gloss_word_vec = glove[gloss_pos]
                    except Exception:
                        # print(gloss_pos, "not found in glove")
                        continue
                    cos_sim = dot(gloss_word_vec, candidate_vec) / (norm(gloss_word_vec) * norm(candidate_vec))
                    if cos_sim > cosine_sim_threshold:
                        word_vectors.append(gloss_word_vec)
        if len(word_vectors) == 0:
            continue
        sense_vector = average(word_vectors, 0)
        vectors[sense] = sense_vector
    return vectors
Example #17
def get_w2v_embeddings_from_pretrained_googlenews_wordnet(
        pretrained_embedding_fpath, save_path):

    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))

    tic = time.time()
    print(
        'Please wait ... (it could take a while to load the file : {})'.format(
            pretrained_embedding_fpath))
    model = gensim.models.KeyedVectors.load_word2vec_format(
        pretrained_embedding_fpath, binary=True)
    print('Done.  (time used: {:.1f}s)\n'.format(time.time() - tic))

    embedding_weights = {}
    found_cnt = 0

    for word in tqdm.tqdm(model.vocab, desc="Filtering Words Using WordNet"):
        if has_digits(word) or len(word) < 3:
            continue
        if len(wordnet.lemmas(word)) > 0:
            embedding_weights[word] = model.word_vec(word)
            found_cnt += 1

    save2pickle(save_path, embedding_weights)
Example #18
def _lemmata_by_freq(query, pos):
    """Return lemmata for query, sorted descending by frequency.

    """

    lemmata = wordnet.lemmas(query, pos)
    return sorted(lemmata, key=lambda lemma: lemma.count(), reverse=True)
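
A short usage sketch for _lemmata_by_freq(), assuming wordnet is nltk.corpus.wordnet:

# Lemmata of 'run' as a verb, most frequently tagged first.
for lem in _lemmata_by_freq('run', 'v'):
    print(lem.name(), lem.count())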
Example #19
def thesaurus(word):
    ss = wn.synsets(word)
    lms = wn.lemmas(word)
    if ss:
        pt = 200
        defl = ""
        for lem in lms:
            if lem.count() == 0:
                lem_count = 100
            else:
                lem_count = lem.count()
            if lem._lexname_index == 0:
                lem_lex_index = 1
            else:
                lem_lex_index = lem._lexname_index / 100

            if pt > lem_count + lem_lex_index:
                pt = lem_count + lem_lex_index
                defl = lem.synset().definition()
        #                defl=defl+markgreen('['+str(lem.count())+'.'+str(lem._lexname_index)+']'+lem.synset().definition()+';<br>')
        #           else:
        #                defl=defl+'['+str(lem.count())+'.'+str(lem._lexname_index)+']'+lem.synset().definition()+';<br>'

        #    ssddf = ss[0].definition()

        #     for ssitem in ss:
        #          curdegf=ssitem.lexname
        return defl  # ss[0].definition()
    else:
        return ""
Example #20
def is_eventive_adjective(token):
    """
    Returns True if the given token is an eventive adjective.  We use simple,
    conservative heuristics that treat adjectives derived from present or past
    participles of verbs as eventive, e.g., 'sparkling' and 'man-made'.
    """
    word = token.text.lower()
    if '-' in word:  # e.g., 'well-known'
        # Finding a head subword is difficult.  First, we check if
        # there is a verb in subwords.  If not, use the last token.
        verb = None
        for subword, pos in reversed(nltk.pos_tag(word.split('-'))):
            if pos.startswith('VB'):
                verb = subword  # found a verb
                break
        word = verb if verb is not None else word.split('-')[-1]

    # Remove a prefix, if any, to make verb conjugation and checking easier below.
    morphemes = get_morphemes(word)
    if morphemes and morphemes[0] in PREFIXES:
        word = word[len(morphemes[0]):]
    if len(word) == 0:
        return False

    conj_inf = conjugate(word, 'inf')
    if wn.lemmas(conj_inf, wn.VERB):
        if conjugate(conj_inf, 'ppart') == word:
            return True
        if conjugate(conj_inf, 'part') == word:
            return True

    return False
Example #21
def best_synset(word_str, pos_tag='n'):
    if isinstance(word_str, Token):
        word_str, pos_tag = word_str.text.lower(), word_str.pos_
    assert isinstance(word_str, str)
    assert isinstance(pos_tag, str)

    lemma = lemmatize(word_str.lower())
    if lemma:
        lemma = lemma[0]
    tag = to_wordnet_tag(pos_tag)

    try:
        if lemma and pos_tag:
            synset = wn.synset('{}.{}.{}'.format(lemma, tag, '01'))
            if synset:
                return synset
            raise WordNetError
    except WordNetError:
        try:
            lemmas = wn.lemmas(lemma)
            if lemmas:
                synset = lemmas[0].synset()
                if synset:
                    return synset
            raise WordNetError
        except WordNetError:
            pass
Example #22
def thesaurus(word):
    ss = wn.synsets(word)
    lms = wn.lemmas(word)
    if ss:
        pt = 200
        defl = ''
        for lem in lms:
            if lem.count() == 0:
                lem_count = 100
            else:
                lem_count = lem.count()
            if lem._lexname_index == 0:
                lem_lex_index = 1
            else:
                lem_lex_index = lem._lexname_index / 100

            if pt > lem_count + lem_lex_index:
                pt = lem_count + lem_lex_index
                defl = lem.synset().definition()


#                defl=defl+markgreen('['+str(lem.count())+'.'+str(lem._lexname_index)+']'+lem.synset().definition()+';<br>')
#           else:
#                defl=defl+'['+str(lem.count())+'.'+str(lem._lexname_index)+']'+lem.synset().definition()+';<br>'

#    ssddf = ss[0].definition()

#     for ssitem in ss:
#          curdegf=ssitem.lexname
        return defl  # ss[0].definition()
    else:
        return ''
Example #23
def hyponyms(word):
    hyponyms = []
    for lemma in wn.lemmas(word):
        for hyponym in lemma.synset.hyponyms():
            for word in hyponym.lemma_names:
                hyponyms.append(word.replace('_',' '))
    return hyponyms
Example #24
def get_sinonimos(palavra: str, lang: str) -> list:
    """
    Runs the synonym query against WordNet.
    :param palavra: word whose synonyms should be looked up
    :param lang: language (abbreviation) into which the synonyms should be translated
    :return: list of strings with the synonyms returned by WordNet. Each string follows the format "^.*\.(a|v|n).([0-9][0-9])\..*$"
    """
    logger.info("Buscando sinonimos: [palavra=%s, lang=%s]", palavra, lang)
    sinonimos = set()

    lemmasDaPalavra = wn.lemmas(palavra, lang=SinonimosConstantes.LEMMAS_LANG)
    logger.debug("Lemmas da palavra '%s' com lang=%s: %s", palavra, SinonimosConstantes.LEMMAS_LANG, lemmasDaPalavra)

    for lemma in lemmasDaPalavra:
        synsetNome = lemma.synset().name()
        synsetLemmas = wn.synset(synsetNome).lemmas(lang)

        for synsetLemma in synsetLemmas:
            synsetLammaName = synsetLemma.name()
            synsetLemmaSynsetName = synsetLemma.synset().name()

            sinonimo = '.'.join([synsetLemmaSynsetName, synsetLammaName])
            sinonimos.add(sinonimo)

            logger.debug("[lemma=%s] = [synsetNome=%s, synsetLemmas=%s] = [synsetLammaName=%s, synsetLemmaSynsetName=%s] = [sinonimo=%s]",
                         lemma, synsetNome, synsetLemmas, synsetLammaName, synsetLemmaSynsetName, sinonimo)

    logger.info("Sinonimos obtidos: %s", str(sinonimos))
    return list(sinonimos)
def get_more_candidates(lemma, pos, depth=2):
    # Return solution as a set to make sure unique lemmas are returned
    possible_synonyms = set([])

    # Retrieve all lexemes for the particular lemma and pos
    lexemes = wn.lemmas(lemma, pos=pos)

    # Iterate over lexemes
    for lexeme in lexemes:
        # Get the synset for current lexeme
        synset = lexeme.synset()

        # Get the lexemes from the synset
        for candidate_lemma in synset.lemmas():
            # Retrieve the name from a lemma structure
            candidate_lemma_name = candidate_lemma.name()

            # Make sure we don't add input lemma as solution
            if candidate_lemma_name != lemma:
                # Check if lemma contains multiple words
                if len(candidate_lemma_name.split('_')) > 1:
                    # Replace '_' with ' ', e.g. 'turn_around' -> 'turn around'
                    candidate_lemma_name = candidate_lemma_name.replace('_', ' ')

                # Add lemma to the solution
                possible_synonyms.add(candidate_lemma_name)

    return possible_synonyms
Example #26
    def get_variant_words(cls, input_word):
        input_word = wn.lemmas(input_word)

        return set([
            form.name() for lem in input_word
            for form in lem.derivationally_related_forms()
        ])
Example #27
def reconstruct(s_element):
    sentence = []
    wf_count = 1
    gnd_atoms = []
    for e in s_element.iter():
        if e.text is not None:
            sentence.append(e.text)
        if e.tag == 'wf':
            wf_count += 1
            word_const = '{}-{:f}'.format(e.text, wf_count)
            if e.get('pos', None) is not None:
                gnd_atoms.append('has_pos({},{})'.format(word_const, e.get('pos')))
            if e.get('lemma', None) is not None:
                lem = e.get('lemma')
                lexsn = e.get('lexsn')
                synset = None
                for l in wordnet.lemmas(lem):
                    if l.key == '{}%%{}'.format(lem, lexsn):
                        synset = l.synset
                if synset is not None:
                    sid = 's-{}'.format(word_const)
                    gnd_atoms.append('has_sense({},{})'.format(word_const, sid))
                    gnd_atoms.append('is_a({},{})'.format(sid, synset.name))
    sentence = ' '.join(sentence).strip()
    return sentence, gnd_atoms
def wn_frequency_predictor(context):
    # Counter with lemma as key and its count as value
    synonyms_counter = Counter()

    # Retrieve all lexemes for the particular lemma and pos
    lexemes = wn.lemmas(context.lemma, pos=context.pos)

    # Iterate over lexemes
    for lexeme in lexemes:
        # Get the synset for current lexeme
        synset = lexeme.synset()

        # Get the lemmas from the synset
        for candidate_lemma in synset.lemmas():
            candidate_lemma_name = candidate_lemma.name()

            # Make sure we don't add input lemma as solution
            if candidate_lemma_name != context.lemma:
                # Check if lemma contains multiple words
                if len(candidate_lemma_name.split('_')) > 1:
                    # Replace '_' with ' ', e.g. 'turn_around' -> 'turn around'
                    candidate_lemma_name = candidate_lemma_name.replace('_', ' ')

                # Add to the solution, the same lemma can be added twice
                # if it appears together with input lemma in multiple synsets
                synonyms_counter[candidate_lemma_name] += candidate_lemma.count()

    # If there is a tie, pick an arbitrary lemma, whatever comes first
    # in the front of the list after sorted descendingly
    return synonyms_counter.most_common(1)[0][0]
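
A minimal sketch of calling wn_frequency_predictor(); the Context type below is a hypothetical stand-in exposing only the two fields the function reads (the original project defines its own Context class):

from collections import namedtuple

Context = namedtuple('Context', ['lemma', 'pos'])
# Most frequent WordNet synonym of the adjective 'slow' across its synsets.
print(wn_frequency_predictor(Context(lemma='slow', pos='a')))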
Example #29
def wn_simple_lesk_predictor(context: Context) -> str:
    stop_words = stopwords.words('english')
    overlap = defaultdict()
    ct = context.right_context + context.left_context
    ct = [i for i in ct if i not in stop_words and i not in string.punctuation]
    for i in wn.lemmas(context.lemma, context.pos):
        if i.synset().lemma_names() != [context.lemma]:
            defi = [j for j in i.synset().examples()]
            defi.append(i.synset().definition())
            for j in i.synset().hypernyms():
                defi = defi + j.examples()
                defi.append(j.definition())
            defi = [
                j for i in defi for j in tokenize(i)
                if j not in stop_words and j not in string.punctuation
            ]
            OV = [i in defi for i in ct]
            overlap[i] = sum(OV) / (len(OV) + 1)
    maximum = max(overlap, key=overlap.get)
    lemmadict = {
        maximum.synset().lemma_names()[i]:
        maximum.synset().lemmas()[i].count()
        for i in range(len(maximum.synset().lemmas()))
    }
    del lemmadict[context.lemma]
    maximum = max(lemmadict, key=lemmadict.get)
    return maximum.replace('_', ' ')
Example #30
def convert_pos_by_lemmas(word, from_pos, to_pos):
    """ Transform words given from/to POS tags """

    lemmas = wn.lemmas(word, pos=from_pos)
    # Word not found
    if not lemmas:
        return []

    # Get related forms
    derivationally_related_forms = [(l, l.derivationally_related_forms())
                                    for l in lemmas]

    # filter only the desired pos (consider 'a' and 's' equivalent)
    related_noun_lemmas = []

    for drf in derivationally_related_forms:
        for l in drf[1]:
            if l.synset().name().split('.')[1] == to_pos or \
                (to_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)
                 and l.synset().name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)):
                related_noun_lemmas += [l]

    # Extract the words from the lemmas
    related_words = [l.name() for l in related_noun_lemmas]
    return list(set(related_words))
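
A hedged usage sketch for convert_pos_by_lemmas(); WN_ADJECTIVE and WN_ADJECTIVE_SATELLITE are assumed to be module-level constants ('a' and 's') in the original project:

# Noun forms reachable from the adjective 'happy' via derivationally related lemmas.
print(convert_pos_by_lemmas('happy', 'a', 'n'))  # e.g. ['happiness', ...]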
Example #31
def make_gerund(word):
    """
    Create gerund form of word if it's possible
    code -> coding
    :param word:
    :return:
    """
    # now = time.time()
    is_found = False
    gerund = word
    for lem in wn.lemmas(word):
        for related_form in lem.derivationally_related_forms():
            if related_form.name().endswith('ing'):
                gerund = related_form.name()
                GerundWords.objects.get_or_create(word=gerund)
                is_found = True
                break
        if is_found:
            break
    if not is_found:
        if word.endswith('e'):
            g_word = GerundWords.objects.filter(word='%sing' % gerund[:-1])
            if g_word:
                return g_word[0].word
        else:
            g_word = GerundWords.objects.filter(word='%sing' % gerund)
            if g_word:
                return g_word[0].word
    # logger.debug("Time get_gerund: {} {}".format(time.time() - now, word))
    return gerund
Example #32
def antonyms(word):
    antonyms = []
    for lemma in wn.lemmas(word):
        for antonym in lemma.antonyms():
            for word in antonym.synset.lemma_names:
                antonyms.append(word.replace('_',' '))
    return antonyms
Example #33
def get_synonyms(term):
    """
    Function that finds the synonyms of a term

    Args:
        term: the term whose synonyms are to be looked up
    Returns:
        the synonyms of the input term
    """
    term_lemmas = wn.lemmas(term, lang="ita")
    if term_lemmas == []:
        return ["Non sono stati trovati sinonimi per questo termine"]
    else:
        synonyms_lemma = term_lemmas[0].synset()
        synonyms_lemmas_ita = synonyms_lemma.lemmas(lang="ita")
        synonyms = list()
        for lemma in synonyms_lemmas_ita:
            if lemma.name() == term:
                pass
            else:
                lemma_str = str(lemma.name())
                lemma = lemma_str.replace('_', ' ')
                synonyms.append(lemma)
    if synonyms == []:
        return ["Non sono stati trovati sinonimi per questo termine"]
    return synonyms
Example #35
def print_syn_lemmas(word):

    ## Synsets and Lemmas
    print("1. Synsets and Lemmas")
    print("Word: " + word)
    print("")
    print("Synsets:")
    [print(s) for s in wn.synsets(word)]
    print("")
    first_synset = wn.synsets(word)[0]
    print("First synset: " + str(first_synset))
    print("")

    #word_synset = wn.synset("dog.n.01")
    print("Lemma names: ")
    [print(l) for l in first_synset.lemma_names()]
    print("")
    last_lemma = first_synset.lemmas()[len(first_synset.lemma_names()) - 1]
    #word_lemma = wn.lemma("dog.n.01.domestic_dog")
    print("Last lemmas: " + str(last_lemma))
    print("")
    print("Synset of Last lemmas: " + str(last_lemma.synset()))
    print("")
    for synset in wn.synsets(word):
        print(str(synset) + ": lemma_name" + str(synset.lemma_names()))
    print("")
    print("Lemmas of {}:".format(word))
    [print(l) for l in wn.lemmas(word)]
    print("")
    print("")
Example #36
def get_candidates(lemma, pos) -> List[str]:
    # Part 1
    all_lemma = []
    for i in wn.lemmas(lemma, pos):
        all_lemma = all_lemma + i.synset().lemma_names()
    all_lemma = set([i.replace('_', ' ') for i in all_lemma if i != lemma])
    return all_lemma
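
A quick usage sketch for get_candidates() above (note that, despite the List[str] annotation, the function actually returns a set):

print(get_candidates('slow', 'a'))  # e.g. {'dull', 'boring', 'deadening', ...}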
Example #37
def pairs(words):
    "Find hyponym/hypernym pairs from a set of words"
    # For each word, find its most common sense
    senses = []
    sense_map = collections.defaultdict(lambda:[])
    for word in words:
        lemmas = wordnet.lemmas(word)
        nouns = [l for l in lemmas if l.synset.pos == 'n']
        counts = [l.count() for l in nouns]
        #print counts
        #print nouns
        best = max(nouns, key=lambda x:x.count())
        #print best, best.synset
        sense_map[best.synset.name].append(word)
        senses.append(best.synset)

    #print max([len(x) for x in sense_map.values()])

    #Take out "entity" as it's too general
    sense_set = {s.name for s in senses} - set(['entity.n.01'])
    for word, sense in zip(words, senses):
        hypernyms = list(sense.closure(lambda x:x.hypernyms()))
        common = {h.name for h in hypernyms} & sense_set
        if len(common) > 0:
            for c in common:
                for w in sense_map[c]:
                    print word, w
Example #38
def add_and_initialize_category_associations(word_synset_dict,
                                             path_to_wordnet_categories,
                                             no_mapping_count=1):
    wordnet_categories = load_as_json(path_to_wordnet_categories)
    for wordnet_category, wordnet_subcategories in wordnet_categories.items():
        for wordnet_word in wordnet_subcategories.keys():
            for lemma in wordnet.lemmas(wordnet_word):
                lemma_name = lemma.name()
                lemma_synset = lemma.synset()
                synset_name = lemma_synset.name()
                if lemma_name not in word_synset_dict:
                    word_synset_dict[lemma_name] = dict()

                if synset_name not in word_synset_dict[lemma_name]:
                    word_synset_dict[lemma_name][synset_name] = dict()
                    word_synset_dict[lemma_name][synset_name][
                        "count"] = no_mapping_count
                    word_synset_dict[lemma_name][synset_name][
                        "definition"] = lemma_synset.definition()

                word_synset_dict[lemma_name][synset_name][
                    "category"] = wordnet_category
                print(lemma_name, synset_name)
                if word_synset_dict[lemma_name][synset_name][
                        "definition"] != lemma_synset.definition():
                    print("wrong")
    return word_synset_dict
Example #39
def SenseNumber(lemma):
  """Get WordNet sense number for lemma."""
  i = 1
  for l in wn.lemmas(lemma.name):
    if l is lemma: return i
    if l.synset.pos == lemma.synset.pos: i = i + 1
  return 9999
Example #40
def generate_mono_list(lemmalist):
    print "Number of WN lemmas = "+str(len(lemmalist))
    monolist=[]
    mwcount=0
    singlesense=0
    lowfreq=0
    predcount=0
    nonpred=0
    for lemma in lemmalist[0:]:
        if notmultiword(lemma):
            #print lemma, wn.synsets(lemma,pos=wn.NOUN)
            if len(wn.synsets(lemma,pos=wn.NOUN))==1:
                freq=0
                for form in wn.lemmas(lemma,wn.NOUN):
                    freq+=form.count()
                if freq>0:
                    singlesense+=1
                    monolist.append(lemma)
                else:
                    lowfreq+=1
            else:
                total=0
                max=0
                for form in wn.lemmas(lemma,wn.NOUN):
                    thiscount=form.count()
                    #print form,thiscount
                    total+=thiscount
                    if thiscount>max:
                        max=thiscount
                if max > (total)/2:
                    predcount+=1
                    monolist.append(lemma)
                    #print "Accepting "+lemma
                else:
                    nonpred+=1

        else:
            #print "lemma "+lemma+" not added as multiword"
            mwcount+=1

    print "Accepted single sense words "+str(singlesense)
    print "Accepted predominant sense words "+str(predcount)
    print "TOTAL accepted "+str(singlesense+predcount)
    print "Rejected single sense words with no freq info "+str(lowfreq)
    print "Rejected non predominant sense words "+str(nonpred)
    print "Rejected multiwords "+str(mwcount)
    return monolist
Example #41
def hypernyms(word):
    hypernyms = []
    for lemma in wn.lemmas(word):
        for hypernym in lemma.synset.hypernyms():
            for word in hypernym.lemma_names:
                if '_' not in word:
                    hypernyms.append(word)
    return hypernyms
Example #42
def _check_antonyms_existence(word, word_sets, wn_pos):
    for le in wn.lemmas(word, pos=wn_pos):
        for antonym in le.antonyms():
            name = antonym.name()
            if name in word_sets or wn.morphy(name) in word_sets or lemma(
                    name) in word_sets:
                return True
    return False
Example #43
def extract_lemmas(sent):
    """Extracts all adjectives in a given sentence"""
    lemmas = []
    tokens = word_tokenize(sent)
    pos_tagged = pos_tag(tokens)
    for word in pos_tagged:
        lemmas.extend(wn.lemmas(word[0]))
    return lemmas
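
A brief usage sketch for extract_lemmas(), assuming word_tokenize, pos_tag and wn are imported as in the example and the NLTK tokenizer/tagger data are available; only tokens that are themselves WordNet lemma forms contribute Lemma objects:

for lem in extract_lemmas("The dog barked loudly"):
    print(lem)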
Example #44
def wordnet():

    wn.synsets('motorcar')
    wn.synset('car.n.01').lemma_names
    wn.synset('car.n.01').definition
    wn.synset('car.n.01').examples

    wn.synset('car.n.01').lemmas
    wn.lemma('car.n.01.automobile') 
    wn.lemma('car.n.01.automobile').synset
    wn.lemma('car.n.01.automobile').name

    wn.synsets('car')
    for synset in wn.synsets('car'):
        print synset.lemma_names

    wn.lemmas('car')
Example #45
def get_word_synsets(word, only_nouns=True):
    lemmas = wn.lemmas(word)

    # if no lemmas were found, try lemmatizing the word first
    if len(lemmas) == 0:
        wnl = nltk.WordNetLemmatizer()
        lemmatized_word = wnl.lemmatize(word)
        lemmas = wn.lemmas(lemmatized_word)
        if len(lemmas) == 0:
            return []

    # some distance measures don't handle non-noun words
    synsets = [lemma.synset for lemma in lemmas]

    if only_nouns:
        return [synset for synset in synsets if synset.name.split('.')[1] == 'n']
    else:
        return synsets
Example #46
def antonyms(word):
    """
        return all antonyms of word
    """

    lemmas = [lemma for lemma in wn.lemmas(word)]
    antonyms = set()
    for lemma in lemmas:
        for antonym in lemma.antonyms():
            antonyms.add(antonym.name)
    return antonyms
Example #47
def _lemmata_by_freq(query, pos):
    """Return lemmata for query, sorted descending by frequency.

    """

    lemmata = wordnet.lemmas(query, pos)
    return sorted(
        lemmata,
        key=lambda lemma: lemma.count(),
        reverse=True
    )
Example #48
def synonyms(word):
    synonyms = []
    for lemma in wn.lemmas(word):
        group = lemma.synset.lemma_names
        if word and (word in group):
            group.remove(word)
            group.insert(0, word)
        if len(group) > 1:
            for synonym in group[1:]:
                if '_' not in synonym:
                    synonyms.append(synonym)
    return synonyms
Example #49
def lesk_ESA_similarity(filename):
  document_similarity=Document_Relatedness()
  print "Lesk ESA"
  f = open(filename,'w')
  tree = etree.parse(path_string)
  dictionary = get_wsd_input_data(tree)
  print "Computing Lesk  Similarity  \n"
  for key in sorted(dictionary.iterkeys()):
    word=wn.morphy(dictionary[key].lower())
    sentence=get_sentence(tree,key)
    context_original=set([])

    for w in get_context_words(sentence):
      if w.isdigit()==False:
        context_original.add(w)
        if wn.lemmas(w)!=[]:
          def_sentence=wn.lemmas(w)[0].synset.definition
          for mmm in get_context_words(def_sentence):
            context_original.add(mmm)

    best_sense=wn.lemmas(word)[0].synset  #best sense: most frequent sense is the default one
    max_overlap=0
    for lemma in wn.lemmas(word):
      context_int=set([])
      for example in lemma.synset.examples:
        for w in get_context_words(example):
          context_int.add(w)
      for w in get_context_words(lemma.synset.definition):
        context_int.add(w)
      overlap=document_similarity.ESA_sentence_similarity(context_original,context_int)
      if overlap>max_overlap:
        max_overlap=overlap
        best_sense=lemma.synset
    l=split_syn_dots(key)

    answer_line=l[0]+" "+key+" eng-30-"+str(best_sense.offset)+"-"+best_sense.pos+"\n"
    f.write(answer_line)
  print "Finished"
Example #50
def trans_verb_list():
    '''Generate a list of transitive verbs.'''
    transitive_verbs = []
    for word in wordnet.all_lemma_names('v'):
        frame_ids = set()
        for lem in wordnet.lemmas(word, 'v'):
            frame_ids.update(lem.frame_ids())
        # Verbs with these frames make sense for our sentences.
        if frame_ids.intersection({8, 9, 10, 11}):
            transitive_verbs.append(word)

    # Remove duplicates by converting to set and back in case of
    # malicious WordNet.
    return list(set(transitive_verbs))
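
The frame-ID filter above relies on WordNet's verb frames; a quick way to inspect them for a single verb (sketch, assuming wordnet is nltk.corpus.wordnet):

for lem in wordnet.lemmas('eat', 'v'):
    print(lem, lem.frame_ids(), lem.frame_strings())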
Example #51
def unambiguous(words):
    "Get a list of (mainly) unambiguous terms"
    for word in words:
        lemmas = wordnet.lemmas(word)
        counts = [x.count() for x in lemmas
                  if x.synset.pos == 'n']
        if len(counts) == 0:
            continue
        m = max(counts)
        if m == 0:
            continue
        p = float(m)/sum(counts)
        if m > 2 and p > 0.8:
            print word
Example #52
def isantonym(word1, word2):
    """
        judge whether two words are antonyms
    """

    w1_lemma = [lemma for lemma in wn.lemmas(word1)]
    w1_antonyms = set()
    for lemma in w1_lemma:
        for antonym in lemma.antonyms():
            w1_antonyms.add(antonym.name)

    if word2 in w1_antonyms:
        return True
    else:
        return False
Example #53
def process(sentences):
    result = []
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    for review in tokenized_sentences:
        new_review = []
        for token in review:
            new_token = regex.sub(u'', token)
            if not new_token == u'' and not new_token in stop_words:
                if len(wordnet.lemmas(new_token)) > 0:
                    new_review.append(lemmatizer.lemmatize(new_token))
            # new_token = regex.sub('', token)
            # if not new_token == '' and not new_token in stop_words:
            #     new_review.append(new_token)
        result.append(new_review)
    return result
def WORDNET(Words):
    '''Whether the Word is present in the WORDNET'''

    '''-------------------------------Feature 15------------------------------'''
    global FeatureDict
    global StopList
    for wrd in Words:
        if wrd.isalpha():
            lem = wn.lemmas(str(wrd))
            if len(lem) > 0:
                temp = wrd
                temp = temp.lower()
                temp = temp.rstrip()
                temp = temp.lstrip()
                if temp not in StopList:
                    wrd = wrd.lower()
                    if wrd in FeatureDict:
                        FeatureDict[str(wrd)][15] = 1
                    else:
                        INITFeatureDictionary(str(wrd))
                        FeatureDict[str(wrd)][15] = 1
def getsynsets(user_input):
    
        for synset in (wn.synsets(user_input)):
            print synset
            nyms = ['hypernyms', 'hyponyms', 'meronyms', 'holonyms', 'part_meronyms', 'sisterm_terms', 'troponyms', 'inherited_hypernyms']
            for i in nyms:
                try:
                    print getattr(synset, i)()
                except AttributeError as e: 
                    print e
                    pass

        for lemma in (wn.lemmas(user_input)):
            print lemma
            lemmanyms = ['antonyms', 'derivationally_related_forms', 'pertainyms']
            for y in lemmanyms:
                try:
                    print getattr(lemma, y)()
                except AttributeError as e:
                    print e
                    pass
Example #56
def wordnet_data(word: str) -> [str]:
    """returns a list of semi-formatted wordnet word data"""
    # definitions, parts of speech, synonyms, antonyms, related words
    result = [[], [], [], set(), [], []]
    word_info = wordnet.synsets(word)
    if len(word_info) > 0:
        word_info = word_info[0]
    else:
        return result
    lemmas = wordnet.lemmas(word)
    for synset in wordnet.synsets(word):
        result[0].append(synset.definition())
        result[1].append(synset.pos())
        # how to access part of speech?
    result[2] = word_info.lemma_names()
    for lemma in lemmas:
        for word in lemma.antonyms():
            result[3].add(word.name())
    result[4] = (word.name().split(".")[0] for word in word_info.hyponyms())
    result[5] = (word.name().split(".")[0] for word in word_info.hypernyms())
    # how to find similar words? Doesn't the corpus analysis do this? Hyponyms?
    return result
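
A short usage sketch for wordnet_data() above; the antonyms come back as a set, while the hyponym and hypernym entries are generators:

defs, pos_tags, synonyms, antonyms, hyponyms, hypernyms = wordnet_data('good')
print(sorted(antonyms))   # antonym lemma names gathered across all 'good' lemmas
print(list(hyponyms))     # head words of hyponyms of the first 'good' synset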
Example #57
def wordnet_data(word: str)->[str]:
    """returns a list of semi-formatted wordnet word data"""
    #definitions, parts of speech, synonyms, antonyms, related words
    result = [[],[],[],set(),[],[]]
    word_info = wordnet.synsets(word)
    if (len(word_info) > 0):
        word_info = word_info[0]
    else:
        return result
    lemmas = wordnet.lemmas(word)
    for synset in wordnet.synsets(word):
        #if synset.name().split('.')[0] != word:
            #continue
        result[0].append(synset.definition())
        result[1].append(synset.pos())
    result[2] = word_info.lemma_names()
    for lemma in lemmas:
        for word in lemma.antonyms():
            result[3].add(word.name())
    result[4] = (word.name().split('.')[0] for word in word_info.hyponyms())
    result[5] = (word.name().split('.')[0] for word in word_info.hypernyms())
    return result
Example #58
def words_of_type(word_type, min_frequency=4):
    '''
    Generate a list of words of WordNet word_type that have a total frequency of at least
    min_frequency times across all senses of the word with word_type.
    '''
    try:
        with open(word_type + '.' + str(min_frequency), 'r') as file:
            return file.read().split('\n')
    except:
        words = []
        for word in wordnet.all_lemma_names(wordnet.__getattribute__(word_type)):
            counts = [lem.count() for lem in wordnet.lemmas(word, wordnet.__getattribute__(word_type))]

            if sum(counts) >= min_frequency:
                words.append(word)

        words = [item for item in words if not item.isdigit()]

        with open(word_type + '.' + str(min_frequency), 'w') as file:
            # Remove duplicates by converting to set and back in case of
            # malicious WordNet.
            file.write('\n'.join(list(set(words))))

        return words
    def estimate(self, word):
        # find lemmas whose surface is the same as a given word
        word = word.lower()
        lemmas = wn.lemmas(word, pos=wn.NOUN)
        abstractness_list = []
        for lemma in lemmas:
            # find all the hypernyms
            tree = lemma.synset.tree(lambda s:s.hypernyms())
            hypernyms = self._flatten(tree)

            # count physical_entity and abstraction synsets
            concrete = self._count_synset(hypernyms, self.CONCRETE_NAME)
            abstract = self._count_synset(hypernyms, self.ABSTRACT_NAME)
            # abstractness = #abst / (#abst + #conc)
            if (concrete + abstract) != 0:
                abstractness = float(abstract) / (abstract + concrete)
                abstractness_list.append(abstractness)

        # take the average (0 if no sense)
        if len(abstractness_list) != 0:
            result = sum(abstractness_list) / len(abstractness_list)
        else:
            result = 0.0
        return result
Example #60
print "Computing Lesk  Similarity  \n"
#for key in dictionary.keys():
#print "en3.s036.t595"," " + dictionary["en3.s036.t595"]
  
for key in sorted(dictionary.iterkeys()):
  #print key," " + dictionary[key]
  word=wn.morphy(dictionary[key].lower())
  #print word,dictionary[key]
  sentence=get_sentence(tree,key)
  context_original=set([])
  #print dictionary[key]
# My first python program
  for w in get_context_words(sentence):
      if w.isdigit()==False:
         context_original.add(w) 
         if wn.lemmas(w)!=[]:
           def_sentence=wn.lemmas(w)[0].synset.definition
           for mmm in get_context_words(def_sentence):
             context_original.add(mmm)
        #   for example in wn.lemmas(w)[0].synset.examples:  adding examples decreases accuracy
        #     for w in get_context_words(example):
        #        context_original.add(w)
  #print key,word  
  best_sense=wn.lemmas(word)[0].synset  #best sense    #most frequent sense is a default one
  max_overlap=0
  for lemma in wn.lemmas(word):
    context_int=set([])
    for example in lemma.synset.examples:         #crawl through all definitions and examples
        #print "Lemma: "+lemma.name+"\n"
        #print "Example: "+ example+"\n"
        for w in get_context_words(example):