Example #1
def _get_words():
    nltk_path = os.path.join(os.path.dirname(__file__), 'nltk_data')
    if nltk_path not in nltk.data.path:
        nltk.data.path.insert(0, nltk_path)

    verbs = sorted({
        word.lower()
        for word in filter(_filter_words, wordnet.all_lemma_names(
            wordnet.VERB))
    })
    verb_size = len(verbs)

    nouns = sorted({
        word.lower()
        for word in filter(_filter_words, wordnet.all_lemma_names(
            wordnet.NOUN))
    })
    noun_size = len(nouns)

    adjs = sorted({
        word.lower()
        for word in filter(_filter_words, wordnet.all_lemma_names(wordnet.ADJ))
    })
    adj_size = len(adjs)

    conjs = sorted([
        'and', 'or', 'lest', 'till', 'nor', 'but', 'yet', 'so', 'unless',
        'when'
    ])
    conj_size = len(conjs)

    return (verbs, verb_size, nouns, noun_size, adjs, adj_size, conjs,
            conj_size)
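The _filter_words predicate used above is defined elsewhere in that project and is not shown here; a minimal sketch, assuming it simply keeps single-word, purely alphabetic lemmas, might look like this:

def _filter_words(word):
    # Hypothetical filter: keep only single-word, purely alphabetic lemmas
    # (multi-word WordNet lemmas contain underscores, which isalpha() rejects).
    return word.isalpha()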
def exact_matching():
    spa_words = {word for word in wn.all_lemma_names(lang='spa') if not discartable(word)}
    por_words = {word for word in wn.all_lemma_names(lang='por') if not discartable(word)}

    common_words = spa_words & por_words

    collator = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8'))
    return sorted(common_words, key=collator.getSortKey)
def similar_matching():
    """Matches ignoring accent."""
    spa_words = {change_similar_letters(word) for word in wn.all_lemma_names(lang='spa') if not discartable(word)}
    por_words = {change_similar_letters(word) for word in wn.all_lemma_names(lang='por') if not discartable(word)}

    common_words = spa_words & por_words

    collator = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8'))
    return sorted(common_words, key=collator.getSortKey)
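The change_similar_letters helper is not shown in this snippet; a minimal sketch, assuming it only strips accents so that Spanish and Portuguese spellings become comparable, could be:

import unicodedata

def change_similar_letters(word):
    # Hypothetical accent-folding helper: decompose characters and drop
    # combining marks, e.g. 'coração' -> 'coracao'.
    decomposed = unicodedata.normalize('NFKD', word)
    return ''.join(c for c in decomposed if not unicodedata.combining(c))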
Example #4
 def lemma_names() -> Dict[str, List[str]]:
     return merge_lemmas(
         ("cmn", wordnet.all_lemma_names(lang="cmn")),
         (
             "qcn",
             (get_opencc().convert(l)
              for l in wordnet.all_lemma_names(lang="qcn")),
         ),
         ("qwc", wordnet.all_lemma_names(lang="qwc")),
     )
Example #5
    def test_iterable_type_for_all_lemma_names(self):
        # Duck-test for iterables.
        # See https://stackoverflow.com/a/36230057/610569
        cat_lemmas = wn.all_lemma_names(lang='cat')
        eng_lemmas = wn.all_lemma_names(lang='eng')

        self.assertTrue(hasattr(eng_lemmas, '__iter__'))
        self.assertTrue(hasattr(eng_lemmas, '__next__') or hasattr(eng_lemmas, 'next'))
        self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)

        self.assertTrue(hasattr(cat_lemmas, '__iter__'))
        self.assertTrue(hasattr(cat_lemmas, '__next__') or hasattr(cat_lemmas, 'next'))
        self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
Example #7
def generate_word_graph(hyp, poly, holo, type):
    if type == 0:
        G1 = snap.TUNGraph.New()
    else:
        G1 = snap.TNGraph.New()
    hypedges = set()
    holoedges = set()
    polyedges = set()
    idToLemma = dict()
    lemmaToId = dict()
    count = 0
    for lemma_name in list(wn.all_lemma_names('n')):
        G1.AddNode(count)
        idToLemma[count] = lemma_name
        lemmaToId[lemma_name] = count
        count += 1
    for lemma_name in list(wn.all_lemma_names('n')):
        if hyp:
            for synset in wn.synsets(lemma_name, "n"):
                for synset2 in synset.hyponyms() + synset.instance_hyponyms():
                    for lemma_name2 in synset2.lemma_names():
                        lemma_name2 = lemma_name2.lower()
                        if type in [0, 1]:
                            G1.AddEdge(lemmaToId[lemma_name],
                                       lemmaToId[lemma_name2])
                            hypedges.add((lemmaToId[lemma_name],
                                          lemmaToId[lemma_name2]))
                        else:
                            G1.AddEdge(lemmaToId[lemma_name2],
                                       lemmaToId[lemma_name])
                            hypedges.add((lemmaToId[lemma_name2],
                                          lemmaToId[lemma_name]))
        if poly:
            for synset in wn.synsets(lemma_name, "n"):
                for lemma_name2 in synset.lemma_names():
                    lemma_name2 = lemma_name2.lower()
                    G1.AddEdge(lemmaToId[lemma_name], lemmaToId[lemma_name2])
                    polyedges.add(
                        (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
        if holo:
            for synset in wn.synsets(lemma_name, "n"):
                for synset2 in synset.member_holonyms() + synset.part_holonyms(
                ) + synset.substance_holonyms():
                    for lemma_name2 in synset2.lemma_names():
                        lemma_name2 = lemma_name2.lower()
                        G1.AddEdge(lemmaToId[lemma_name],
                                   lemmaToId[lemma_name2])
                        holoedges.add(
                            (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
    snap.DelSelfEdges(G1)
    return G1, idToLemma, lemmaToId, hypedges, polyedges, holoedges
Example #8
 def get_all_lemmas(self, replace_ws=True):
     all_wn_lemmas = list(wn.all_lemma_names())
     if replace_ws:
         all_wn_lemmas = [
             lemma.replace('_', ' ') for lemma in all_wn_lemmas
         ]
     return all_wn_lemmas
def main(argv):
  huang_vocab = LoadHuang()
  manaal_vocab = LoadManaal()
  brown_vocab = LoadBrown()

  all_lemmas = {x.lower() for x in wn.all_lemma_names(pos=wn.ADJ)}
  all_alpha_lemmas = {x for x in all_lemmas if x.isalpha()}
  all_synsets = set(wn.all_synsets(pos=wn.ADJ))
  all_alpha_synsets = {x for x in all_synsets if IsAlphaSS(x)}
  all_lemmas_with_single_synset = {x for x in all_lemmas if IsSingleSynset(x)}
  all_lemmas_ambig_synset = {x for x in all_lemmas if not IsSingleSynset(x)}
  all_lemmas_with_single_synset_alpha = {x for x in all_lemmas_with_single_synset if x.isalpha()}
  all_lemmas_ambig_synset_alpha = {x for x in all_lemmas_ambig_synset if x.isalpha()}
  all_alpha_lemmas_has_noun = {x for x in all_alpha_lemmas if LemmaHasNoun(x)}
  all_alpha_lemmas_has_noun_single_lexname = {x for x in all_alpha_lemmas_has_noun if IsNounSingleLexName(x) }
  print "all_lemmas:", len(all_lemmas)
  print "all_alpha_lemmas:", len(all_alpha_lemmas)
  print "all_synsets:", len(all_synsets)
  print "all_alpha_synsets:", len(all_alpha_synsets)
  print "all_lemmas_with_single_synset:", len(all_lemmas_with_single_synset)
  print "all_lemmas_ambig_synset:", len(all_lemmas_ambig_synset)
  print "all_lemmas_with_single_synset_alpha", len(all_lemmas_with_single_synset_alpha)
  print "all_lemmas_ambig_synset_alpha", len(all_lemmas_ambig_synset_alpha)
  print "all_alpha_lemmas_has_noun", len(all_alpha_lemmas_has_noun)
  print "all_alpha_lemmas_has_noun_single_lexname", len(all_alpha_lemmas_has_noun_single_lexname)
  print "huang.intersect(all_alpha_lemmas)", len(huang_vocab.intersection(all_alpha_lemmas))
  print "manaal.intersect(all_alpha_lemmas)", len(manaal_vocab.intersection(all_alpha_lemmas))
  print "brown.intersect(all_alpha_lemmas)", len(brown_vocab.intersection(all_alpha_lemmas))
  print "huang*manaal*brown*all_alpha_lemmas", len(huang_vocab.intersection(all_alpha_lemmas, manaal_vocab, brown_vocab))
  print "huang.intersect(all_lemmas_with_single_synset_alpha)", len(huang_vocab.intersection(all_lemmas_with_single_synset_alpha))
  print "manaal.intersect(all_lemmas_with_single_synset_alpha)", len(manaal_vocab.intersection(all_lemmas_with_single_synset_alpha))
  print "brown.intersect(all_lemmas_with_single_synset_alpha)", len(brown_vocab.intersection(all_lemmas_with_single_synset_alpha))
  print "huang*manaal*brown*all_lemmas_with_single_synset_alpha", len(huang_vocab.intersection(all_lemmas_with_single_synset_alpha, manaal_vocab, brown_vocab))
Example #10
def generateKnownPatterns():
    from nltk.corpus import brown, reuters, words, wordnet
    from string import ascii_lowercase as ALPH

    patterns = {}

    wordlist = sorted(
        set([x.lower()
             for x in brown.words()] + [x.lower() for x in reuters.words()] +
            [x.lower() for x in words.words()] +
            [x.lower() for x in wordnet.all_lemma_names()]))
    for word in list(wordlist):
        if any(x not in ALPH for x in word):
            wordlist.remove(word)
    with open("static/txt/wordlist.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(wordlist))

    for word in wordlist:
        p = pattern(word)
        if p in patterns:
            patterns[p].append(word)
        else:
            patterns[p] = [word]

    with open("static/txt/patterns.json", "w", encoding="utf-8") as f:
        json.dump(patterns, f)
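The pattern helper used above is not shown; a common choice for this kind of substitution-cipher word index is a canonical letter pattern, so a purely hypothetical implementation might be:

def pattern(word):
    # Hypothetical pattern function: map each distinct letter to the order of
    # its first appearance, so 'letter' -> '0.1.2.2.1.3'.
    order = {}
    return '.'.join(str(order.setdefault(ch, len(order))) for ch in word)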
def extractConditionAttribute(nouns,attributesList):
    greaterThanList = ['greater', 'bigger', 'higher', 'great', 'more','lesser', 'smaller', 'lower', 'less']
    lesserThanList = ['lesser', 'smaller', 'lower', 'less']
    equalList = ['equal', 'equals', 'same']
    extractedWordsList=[]
    conditionAttributeList=[]
    for word in nouns:
        id=0
        if(word in greaterThanList or word in lesserThanList or word in equalList):
            id=nouns.index(word)
            attributeName=nouns[id-1]
            extractedWordsList.append(attributeName)
            nouns.remove(nouns[id])
    for word in extractedWordsList:
        if (word in attributesList):
            conditionAttributeList.append(word)
        else:
            for att in attributesList:
                lemmas = set(wordnet.all_lemma_names())
                if (att in lemmas and word in lemmas):
                    #print("true")
                    s1 = wn.synsets(att)[0]
                    s2 = wn.synsets(word)[0]
                    sim = s1.wup_similarity(s2)
                    if sim >= 0.8:
                        conditionAttributeList.append(att)
    return conditionAttributeList
def get_antonyms(remove_duplicates=False):
    antonyms = []
    lemmas_with_antonyms = []
    lemma_antonym_pairs = []
    for l in wn.all_lemma_names():
        synsets = wn.synsets(l)
        for s in synsets:
            for s_l in s.lemmas():
                if s_l.name() != l:
                    # Here I only care about the antonyms of the current lemma
                    continue

                found_antonym = False
                for a in s_l.antonyms():
                    antonyms.append(a)
                    found_antonym = True

                    if remove_duplicates:
                        lemma_antonym_pairs.append(frozenset((s_l, a)))
                    else:
                        lemma_antonym_pairs.append((s_l, a))

                if found_antonym:
                    lemmas_with_antonyms.append(s_l)

    if remove_duplicates:
        lemma_antonym_pairs = list(set(lemma_antonym_pairs))

    return antonyms, lemmas_with_antonyms, lemma_antonym_pairs
Example #13
 def extract(self, blabla, max_pick_word):
     """
         Extract wordnet nouns (or proper noun) from a blabla, which are not on the memory, nor on the selGraph, nor in EXCLUDED
         #TODO: TAKE ALSO WIKIPEDIA STILL
         Self Quest bounded to a maximum of max_pick_word to avoid too long wait. Beware, of found wikipediable word!
         Beware of upper or lower letters which can create conflicts.
         #TODO: Test Edge Cases and memor
     """
     OKWordnet = []
     wn_lemmas = set(
         wordnet.all_lemma_names())  #TODO: SHALL LOAD IT ONLY ONCE???
     if len(blabla) == 0:  #empty
         self.log.info("No new words to grow from.")
     else:
         counter = 0  #count words added
         for word, pos in nltk.pos_tag(word_tokenize(blabla)):
             if counter < max_pick_word:  #Stop once has enough words
                 if pos in ['NN', 'NNS', 'NNP']:
                     if not word.isupper(
                     ):  #To avoid turning words like AI lower case. Else turn to lower case. Ex: donald_trump
                         word = word.lower()
                     #TODO: Need Lemmatizer to avoid words which have same roots?
                     if ((word in wn_lemmas) or
                         (wikipedia.page(word).exists())
                         ) and not (word in OKWordnet):
                         if word in self.graph.keys(
                         ):  #Word is there, augment its weight.
                             self.graph[word][0] = self.graph[word][0] * 1.1
                         else:  #TODO: Shall exclude memory ?
                             OKWordnet.append(word)
                             counter += 1
         #Special case of duo words for wikipedia, such as global_warming https://en.wikipedia.org/wiki/Global_warming
         #TODO: FOR THESE, use wikipedia!?
         wordList = blabla.split(
         )  #then need word.strip(string.punctuation)
         token_list = nltk.pos_tag(word_tokenize(blabla))
         counter = 0
         for token1, token2 in zip(token_list,
                                   token_list[1:]):  #Consecutive token
             word1, pos1 = token1
             word2, pos2 = token2  # pos2 is not used below
             if counter < max_pick_word and len(word1) > 1 and len(
                     word2) > 1 and (word1 not in EXCLUDED) and (
                         word2
                         not in EXCLUDED):  #Stop once has enough words
                 if not word1.isupper(
                 ):  #lower letter unless fully upper letter:check for proper noun
                     word1 = word1.lower()
                 if not word2.isupper(
                 ):  #lower letter unless fully upper letter
                     word2 = word2.lower()
                 duo = word1 + " " + word2
                 if wikipedia.page(duo).exists() and not (duo in OKWordnet):
                     if duo in self.graph.keys():
                         self.graph[duo][0] = self.graph[duo][0] * 1.1
                     else:
                         OKWordnet.append(duo)
                         counter += 1
         self.log.info("New words to learn from")  #+OKWordnet)
     return OKWordnet
Example #14
def init_dictionary():
    # Note: I also tried adding the words from /usr/share/dict/web,
    # but the only additional words it had that were not already in
    # the next two dicts were people's proper names. Not useful.
    global DICTIONARY
    DICTIONARY = set(nltk_words.words())
    DICTIONARY.update(nltk_wordnet.all_lemma_names())
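A possible usage sketch (not from the original project), assuming DICTIONARY is then used for simple membership checks:

init_dictionary()
for token in ['colour', 'color', 'qwzx']:
    # Membership test against the combined words-corpus + WordNet lemma set.
    print(token, token in DICTIONARY)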
Example #15
 def gloss(self, word):
     wn_lemmas = set(wn.all_lemma_names())
     if word in wn_lemmas:
         syn = wn.synsets(word)[0]
         return syn.definition()
     else:
         return None
Example #16
def polysemy_analysis():
    """Returns the average polysemy (number of senses) for nouns, verbs, adjectives, and adverbs in WordNet."""
    conditions = (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV)
    cfd = nltk.ConditionalFreqDist((pos, len(wn.synsets(lemma_name, pos)))
                                   for pos in conditions
                                   for lemma_name in wn.all_lemma_names(pos))
    for pos in cfd.conditions():
        print("{0:2s} {1:10f}".format(pos, sum(item[0] * item[1] for item in cfd[pos].items()) / cfd[pos].N()))
def polysemy_meth2(word_class):
    print("Computing average polysemy of word class(" + word_class +
          ") .........")
    polysemy_count = 0
    all_lemma_names = set(wordnet.all_lemma_names(word_class))
    for name in all_lemma_names:
        polysemy_count += len(wordnet.synsets(name, word_class))
    return polysemy_count / len(all_lemma_names)
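A usage sketch (not part of the original projects) that reports the average polysemy per part of speech with polysemy_meth2:

for pos in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV):
    print(pos, polysemy_meth2(pos))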
def filterTokensUsingNltkWordnet():
    # for k in wordnet.all_lemma_names():
    #     print(k)
    all_lemma_list = [k for k in wordnet.all_lemma_names()]
    
    tokset = set(freqtoklistdic["all"])
    wntset = set(all_lemma_list)
    iset = tokset.intersection(wntset)
    inwordnet = len(iset)
Example #19
def synset_senses(text):
    synsets = []
    wn_lemmas = set(wn.all_lemma_names())
    for word, tag in text:
        lemma = lemmatizer.lemmatize(word, tag)
        if lemma in wn_lemmas:
            words = (list(swn.senti_synsets(word))[0])
            synsets.append(words)
    return synsets
Example #20
 def all(self):
     thai_wn_words = []
     thai_wn_lemmas = [x for x in wn.all_lemma_names(lang='tha')]
     for word in thai_wn_lemmas:
         meanings = wn.synsets(word, lang='tha')
         word = word.replace('_', ' ')
         for meaning in meanings:
             thai_wn_words.append((word, meaning.lemma_names()[0]))
     indices = list(range(len(thai_wn_words)))
     return thai_wn_words, indices
Example #21
def print_sentence(word):
    if word in word_in_sentence:
        if word in wn.all_lemma_names():
            return (wn.synsets(word)[0].examples())
        else:
            msg = ["cannot find example sentence for this word"]
            return msg
    else:
        msg = ["cannot find example sentence for this word"]
        return msg
Example #22
    def __init__(self, exwordnet, name, pos, lang='eng'):
        self._exwordnet = exwordnet
        if name not in wn.all_lemma_names(pos=pos, lang=lang):
            raise exWordNetError('word %s.%s in %s is not defined in WordNet' %
                                 (name, pos, lang))

        self._name = name
        self._pos = pos
        self._lang = lang
        self._key = '%s.%s.%s' % (name, pos, lang)
Example #23
    def all_words(self, pos=None, lang='eng'):
        if pos is None:
            pos_tags = ['n', 'v', 'a', 'r']
        else:
            pos_tags = [pos]

        for pos_tag in pos_tags:
            for l in wn.all_lemma_names(pos=pos_tag, lang=lang):
                word = Word(self, l, pos_tag, lang=lang)
                yield word
Example #24
def generate_word_graph(hyp, poly, holo):
    G1 = nx.Graph()
    hypedges = set()
    holoedges = set()
    polyedges = set()
    idToLemma = dict()
    lemmaToId = dict()
    count = 0
    for lemma_name in list(wn.all_lemma_names('n')):
        G1.add_node(count)
        idToLemma[count] = lemma_name
        lemmaToId[lemma_name] = count
        count += 1
    for lemma_name in list(wn.all_lemma_names('n')):
        if hyp:
            for synset in wn.synsets(lemma_name, "n"):
                for synset2 in synset.hyponyms() + synset.instance_hyponyms():
                    for lemma_name2 in synset2.lemma_names():
                        lemma_name2 = lemma_name2.lower()
                        G1.add_edge(lemmaToId[lemma_name],
                                    lemmaToId[lemma_name2])
                        hypedges.add(
                            (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
        if poly:
            for synset in wn.synsets(lemma_name, "n"):
                for lemma_name2 in synset.lemma_names():
                    lemma_name2 = lemma_name2.lower()
                    G1.add_edge(lemmaToId[lemma_name], lemmaToId[lemma_name2])
                    polyedges.add(
                        (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
        if holo:
            for synset in wn.synsets(lemma_name, "n"):
                for synset2 in synset.member_holonyms() + synset.part_holonyms(
                ) + synset.substance_holonyms():
                    for lemma_name2 in synset2.lemma_names():
                        lemma_name2 = lemma_name2.lower()
                        G1.add_edge(lemmaToId[lemma_name],
                                    lemmaToId[lemma_name2])
                        holoedges.add(
                            (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
    G1.remove_edges_from(G1.selfloop_edges())
    return G1, idToLemma, lemmaToId, hypedges, polyedges, holoedges
Example #25
def process():
    # Install Open Multilingual Wordnet if not already installed.
    nltkd = nltk.downloader.Downloader()
    if not nltkd.is_installed('omw'):
        nltk.download('omw')

    # Figure out ISO 639-2 code for specified locale. Exit if unavailable.
    print(args.language)
    iso639_2 = langcodes.best_match(args.language, wn.langs())[0]
    print(iso639_2)
    print(wn.langs())
    if iso639_2 == 'und': # Nearest ISO 639-2 code is undefined.
        exit("Requested language is not available on this NLTK Wordnet installation.")

    # Obtain set of lowercased lemmas that belong to only one part of speech.
    posdb = dict()
    single_pos_lemmas = set()
    for pos in ['a', 'r', 'n', 'v']:
        posdb[pos] = set()
        # Note: wn.all_lemma_names() returns the lemma names in all lowercase.
        # To remove lemmas that are sometimes or always capitalised in normal
        # writing (e.g. "China" or "Arulo"), we will need to obtain capitalised
        # lemmas from Wordnet later on, and remove members of our output set
        # that are identical to the lowercased transformation of those
        # capitalised lemmas.
        for lemma in wn.all_lemma_names(pos=pos, lang=iso639_2):
            posdb[pos].add(lemma)
        single_pos_lemmas.symmetric_difference_update(posdb[pos])

    # Remove lemmas containing characters other than a-z.
    output_set = set()
    for term in single_pos_lemmas:
        if non_word.search(term) != None:
            continue
        output_set.add(term)

    # Obtain a set of lemmas that are typically capitalised in normal writing.
    unlowered_lemmas = set()
    for synset in list(wn.all_synsets()):
        for lemma in synset.lemma_names():
            unlowered_lemmas.add(lemma)
    # Remove members of output_set that match the lowercased form of a lemma
    # that is typically capitalised in WordNet (see the note above).
    typically_capitalised = {lemma.lower() for lemma in unlowered_lemmas if not lemma.islower()}
    output_set = {word for word in output_set if word not in typically_capitalised}
    # Filter inspiration: http://stackoverflow.com/a/16562558
    output_set = [x for x in output_set if len(x) > 4 and len(x) < 7]
    names_lowered = set()
    for name in nltk.corpus.names.words():
        names_lowered.add(name.lower())
    output_set = [x for x in output_set if x not in names_lowered]
    print(output_set)
    # print single_pos_lemmas
    print(len(single_pos_lemmas))
    print(len(output_set))
Example #26
    def extractWordsAndSynsets(self, filenameWords, filenameSynsets,  filenameLexemes):
        #file
        fWords = codecs.open(filenameWords, 'w', 'utf-8')
        fSynsets = codecs.open(filenameSynsets, 'w',  'utf-8')
        fLexemes = codecs.open(filenameLexemes, 'w',  'utf-8')

        wordCounter = 0
        wordCounterAll = 0
        synsetCounter = 0
        synsetCounterAll = 0
        lexemCounter = 0
        lexemCounterAll = 0

        ovv = []

        for pos in self.pos_list:
            for word in wn.all_lemma_names(pos=pos, lang=self.lang):
                wordCounterAll += 1
                self.WordIndex[word] = wordCounterAll
                fWords.write(word+" ")
                synsetInWord = 0
                for synset in wn.synsets(word, lang=self.lang):
                    lexemCounterAll += 1
                    synsetId = synset.name()
                    if self.Shared.in_vocab(synsetId):
                        synsetInWord += 1
                        if synsetId not in self.SynsetIndex:
                            fSynsets.write(synsetId + " " + self.Shared.getVectorAsString(self.Shared.model[synsetId]) + "\n")
                            synsetCounter += 1
                            self.SynsetIndex[synsetId] = synsetCounter

                        lexemCounter += 1
                        #lemma name
                        sensekey = wn.lemma(synset.name()+'.'+word).key()

                        fWords.write(sensekey + ",")
                        fLexemes.write(str(self.SynsetIndex[synsetId]) + " " + str(wordCounterAll) + "\n")
                    else:
                        ovv.append(synsetId)


                fWords.write("\n")
                if synsetInWord != 0:
                    wordCounter += 1
                else:
                    self.WordIndex[word] = -1
        fWords.close()
        fSynsets.close()
        fLexemes.close()
        print("   Words: %d / %d\n" % (wordCounter, wordCounterAll))
        print("  Synset: %d / %d\n" % (synsetCounter, synsetCounter + len(ovv)))
        print("  Lexems: %d / %d\n" % (lexemCounter, lexemCounterAll))
Example #27
def trans_verb_list():
    '''Generate a list of transitive verbs.'''
    transitive_verbs = []
    for word in wordnet.all_lemma_names('v'):
        frame_ids = set()
        for lem in wordnet.lemmas(word, 'v'):
            frame_ids.update(lem.frame_ids())
        # Verbs with these frames make sense for our sentences.
        if frame_ids.intersection({8, 9, 10, 11}):
            transitive_verbs.append(word)

    # Remove duplicates by converting to set and back in case of
    # malicious WordNet.
    return list(set(transitive_verbs))
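For context, the frame IDs tested above come from WordNet's verb frames; a small sketch (using the standard NLTK wordnet API) showing how to inspect them for one verb:

from nltk.corpus import wordnet

for lem in wordnet.lemmas('give', 'v'):
    # Each verb lemma lists the syntactic frames it participates in.
    print(lem, lem.frame_ids())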
Example #28
def retrieve_unambiguous(extraction_dictionary,
                         with_example=True,
                         keep_mwe=True):
    """
    read unambiguous definitions from nltk wordnet
    """
    lemma_data = set()
    if keep_mwe:
        lemma_data = {(lm.lower(), lg)
                      for lg in extraction_dictionary
                      for p in extraction_dictionary[lg]
                      for lm in wn.all_lemma_names(lang=lg, pos=p)}
    else:
        lemma_data = {(lm.lower(), lg)
                      for lg in extraction_dictionary
                      for p in extraction_dictionary[lg]
                      for lm in wn.all_lemma_names(lang=lg, pos=p)
                      if "_" not in lm}
    for lemma, lang in lemma_data:
        synsets = wn.synsets(lemma)
        if len(synsets) == 1:
            synset = synsets[0]
            if with_example:
                examples = {
                    ex
                    for ex in synset.examples()
                    if lemma in set(ex.split())
                }
                if not len(examples):
                    yield lemma, lang, synset.pos(), synset.definition(), ""
                else:
                    for example in examples:
                        yield lemma, lang, synset.pos(), synset.definition(
                        ), example
            else:
                yield lemma, lang, synset.pos(), synset.definition()
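A hedged usage sketch, assuming extraction_dictionary maps language codes to lists of parts of speech (as the comprehensions above imply):

extraction_dictionary = {'eng': ['n', 'v']}
for row in retrieve_unambiguous(extraction_dictionary, with_example=False):
    print(row)  # (lemma, lang, pos, definition)
    break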
Example #29
def get_rev_map(lang):
    if lang not in WORDNET_FILTERS:
        return lambda x: x
    filter = WORDNET_FILTERS[lang]

    def rev_map(x):
        return _rev_maps[lang].get(x, x)

    if lang not in _rev_maps:
        m = {}
        for lemma in wordnet.all_lemma_names(lang=lang):
            filtered_lemma = filter(lemma)
            m[filtered_lemma] = lemma
        _rev_maps[lang] = m
    return rev_map
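WORDNET_FILTERS and _rev_maps are defined elsewhere in that project; a minimal sketch of the assumed shape, with a purely hypothetical lowercasing filter for Finnish:

WORDNET_FILTERS = {'fin': str.lower}   # hypothetical: per-language lemma filters
_rev_maps = {}                         # lang -> {filtered lemma: original lemma}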
 def _is_object(self, word):
     wn_lemmas = set(wordnet.all_lemma_names())
     lemmatizer = WordNetLemmatizer()
     word = lemmatizer.lemmatize(word)
     str_var = "".join(word + ".n.01")
     if word in wn_lemmas and str_var in [
             syn.name() for syn in wordnet.synsets(word)
     ]:
         w = wordnet.synset(str_var)
         synset_obj = wordnet.synset("object.n.01")
         if synset_obj.wup_similarity(w) >= OBJECT_SIMILARITY:
             return True
         else:
             return False
     return False
Example #31
def average_polysemy(part_of_speech):

    # 1. aggregate all lemmas in wordnet that have the given POS (hint: use nltk.corpus.wordnet.all_lemma_names)
    # 2. sum the number of meanings of each lemma (restricted to the given POS)
    # 3. return the average polysemy of a given POS

    all_lemmas = set(wn.all_lemma_names(part_of_speech))

    #print(len(all_lemmas))

    meanings_length = 0
    for lemma in all_lemmas:
        meanings = wn.synsets(lemma, part_of_speech)
        meanings_length = meanings_length + len(meanings)

    return meanings_length / len(all_lemmas)
Example #32
    def filterWN(self):
        '''
        Takes the words from self.words that are in WordNet and saves them in
        self.filtered_words.
        :return: None
        '''
        self.logger.info('Starting filtrated with WordNet')

        wn_lemmas = set(wn.all_lemma_names())
        for j in self.words:
            if j in wn_lemmas:
                self.filtered_words.append(j)

        self.filtered_words = list(set(self.filtered_words))

        self.logger.info('Finished filtrated with WordNet')
Example #33
 def synset_ratio(self):
     new_var = "vtn_synsets"
     wn_lemmas = [l.encode('ascii') for l in wn.all_lemma_names()]
     has_item = self.compare_items(wn_lemmas)
     new_column = []
     if False in has_item:
         self._warning_msg('synset_ratio', wn_lemmas)
     for record, exists in zip(self._dict, has_item):
         if exists:
             n_synsets = len(wn.synsets(record, 'n'))
             v_synsets = len(wn.synsets(record, 'v'))
             vtn_synsets = float(v_synsets) / (n_synsets + v_synsets)
         else:
             vtn_synsets = None
         new_column.append(vtn_synsets)
     self._append_column(new_column, new_var)
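A small illustration (not part of the original class) of the verb-to-(noun+verb) synset ratio computed above, for a single word:

n_synsets = len(wn.synsets('run', 'n'))
v_synsets = len(wn.synsets('run', 'v'))
# Fraction of this word's noun+verb synsets that are verbal.
print(float(v_synsets) / (n_synsets + v_synsets))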
Example #34
def exercise4():
    words = [n for n in wn.all_lemma_names()]
    more_than_one = []
    for word in words:

        tags = []
        for synset in wn.synsets(word):
            dot_location = synset.name().find('.')
            if synset.name()[0:dot_location] == word:
                tags.append(synset.name()[dot_location + 1:dot_location + 2])
        if len((set(tags))) > 1:
            more_than_one.append(word)

    print("More than one tagged words:", len(more_than_one))
    print("Total:", len(words))
    print("Percentage is:", (len(more_than_one) / len(words)) * 100)
Example #35
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .s2v
    file of sense-tagged tokens (one line per input sentence).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    msg.text("Preprocessing text...")
    texts = [line.rstrip() for line in open(in_file, 'r')]
    docs = nlp.pipe(texts, n_process=n_process)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    wn_lemmas = set(wordnet.all_lemma_names())
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            # print(doc)
            spans = get_phrases(doc, wn_lemmas)
            spans = filter_spans(spans)
            # print('NOUN SPAN', str(spans))
            doc = merge_phrases(doc, spans)
            spans = get_adjective_phrases(doc)
            spans = filter_spans(spans)
            # print('ADJ SPAN', str(spans))
            # print('*-----------------------------------------*')
            doc = merge_phrases(doc, spans)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
Example #36
def generate_irrelevant(file_name='irrelevent_min.txt'):
    # generate irrelevant word pairs, will be used as negative label
    all_lemma = [i for i in wn.all_lemma_names() if '_' not in i]
    length = len(all_lemma)
    count = 0
    f = open(file_name, 'w')
    for i in wn.all_synsets():
        m = len(i.lemma_names()) // 3 + 1
        for j in range(m):
            current_word = i.lemma_names()[random.randint(0, len(i.lemma_names()) - 1)]
            if '_' not in current_word:
                for _ in range(2):
                    word = all_lemma[random.randint(0, length - 10)]
                    if word not in i.lemma_names():
                        count += 1
                        f.write(current_word + ', ' + word + '\n')
    print(count)
    f.close()
Example #37
 def wordnet_synsets(self):
     """Number of WordNet "synsets" (roughly, senses) for the given word.
     This variable collapses across different parts of speech, meanings,
     etc.
     """
     new_var = "n_synsets"
     wn_lemmas = [l.encode('ascii') for l in wn.all_lemma_names()]
     has_item = self.compare_items(wn_lemmas)
     new_column = []
     if False in has_item:
         self._warning_msg('wordnet_synsets', wn_lemmas)
     for record, exists in zip(self._dict, has_item):
         if exists:
             n_synsets = len(wn.synsets(record))
         else:
             n_synsets = None
         new_column.append(n_synsets)
     self._append_column(new_column, new_var)
Example #38
def monosemous():
    all_lemmas = list(wn.all_lemma_names())
    print("First ten lemmas: ", all_lemmas[:10])
    print("All lemmas: ", len(all_lemmas))

    # Only interested in single words
    single_word_lemmas = [x for x in all_lemmas if '_' not in x]
    print("First ten single word lemmas: ", single_word_lemmas[:10])
    print("Single word lemmas: ", len(single_word_lemmas))

    # Find monosemous
    monosemous = [x for x in single_word_lemmas if len(wn.synsets(x)) == 1]
    print(monosemous[:10])
    senses = [(x, wn.synsets(x)[0]) for x in monosemous]
    noun_senses = [x for x in senses if x[1].pos() == 'n']
    print("First ten monosemous nouns: ", noun_senses[:10])
    print("Number of monosemous nouns: ", len(noun_senses))
    return noun_senses
Example #39
def main(output):
    pronouns = {}
    for lemma in wordnet.all_lemma_names(lang="fin"):
        if "_" not in lemma:
            continue
        for b in lemma.split("_"):
            if b not in ALL_ABBRVS:
                continue
            pronouns.setdefault(b, []).append(lemma)
    if output == "group":
        for p, v in sorted(pronouns.items()):
            print(p, v)
    elif output == "abbrv":
        for p, v in sorted(pronouns.items()):
            print(p)
    elif output == "pron":
        for v in sorted((v for lemma in pronouns.values() for v in lemma)):
            print(v)
def getTableName(word, tables):
    tableList = ''
    for table in tables:
        if word == table:
            tableList = table
        else:
            s1Lemmas = set(wordnet.all_lemma_names())
            if table in s1Lemmas and word in s1Lemmas:
                s1 = wn.synsets(table)[0]
                s2 = wn.synsets(word)[0]
                sim = s1.wup_similarity(s2)
                if sim is None:
                    return
                elif sim >= 0.8:
                    tableList = table
                else:
                    return

    return tableList
Example #41
def words_of_type(word_type, min_frequency=4):
    '''
    Generate a list of words of WordNet word_type that have a total frequency of at least
    min_frequency times across all senses of the word with word_type.
    '''
    try:
        with open(word_type + '.' + str(min_frequency), 'r') as file:
            return file.read().split('\n')
    except:
        words = []
        for word in wordnet.all_lemma_names(wordnet.__getattribute__(word_type)):
            counts = [lem.count() for lem in wordnet.lemmas(word, wordnet.__getattribute__(word_type))]

            if sum(counts) >= min_frequency:
                words.append(word)

        words = [item for item in words if not item.isdigit()]

        with open(word_type + '.' + str(min_frequency), 'w') as file:
            # Remove duplicates by converting to set and back in case of
            # malicious WordNet.
            file.write('\n'.join(list(set(words))))

        return words
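A hedged usage sketch, assuming word_type names a WordNet POS constant such as 'VERB' (the code resolves it with wordnet.__getattribute__):

frequent_verbs = words_of_type('VERB', min_frequency=4)
print(len(frequent_verbs), frequent_verbs[:10])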
Example #42
def noun_vocab(tcm=None, postagger=None, min_length=0):
    """
    Return all nouns found in wordnet in both singular and plural forms,
    along with POS tag and synset (as given by a TreeCutModel instance).
    """
    if not postagger:
        postagger = BackoffTagger()

    getpostag = lambda word : postagger.tag([word])[0][1]
    singular_n_pos = getpostag("house")
    plural_n_pos   = getpostag("houses")

    nouns = set()

    for lemma in wn.all_lemma_names(pos = 'n'):
        if len(lemma) < min_length:
            continue
        if '_' in lemma:
            continue

        plural = None
        if lemma[-1] != 's':
            plural = pluralize(lemma)
            # use the plural only if it still enables us to
            # get to the synsets (some words shouldn't be pluralized)
            if len(wn.synsets(plural)) == 0:
                plural = None

        for syn in wn.synsets(lemma, 'n'):
            classes = tcm.predict(syn) if tcm is not None else [syn.name()]
            for classy in classes:
                nouns.add((lemma, singular_n_pos, classy))
                if plural is not None:
                    nouns.add((plural, plural_n_pos, classy))

    return nouns
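A hedged usage sketch; without a TreeCutModel each noun is labelled with its own synset name:

nouns = noun_vocab(min_length=3)
# Each entry is a (word form, POS tag, synset/class name) triple.
print(len(nouns))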
Example #43
def diversity(pos):
    lemmas = list(wordnet.all_lemma_names(pos=pos))
    count = 0
    for word in lemmas:
        count = count + len(wordnet.synsets(word, pos))
    return count / len(lemmas)
#!/usr/bin/env python
"""
NPR 2017-12-31
https://www.npr.org/2017/12/31/574287856/sunday-puzzle-new-names-in-the-news

Name a famous singer — 3 letters in the first name, 5 letters in the last. 
Drop the middle letter of the last name and rearrange the result to name a 
variety of singing group. What is it?
"""
#%%
import sys
sys.path.append('..')

from nprcommontools import get_famous_names, sort_string
from nltk.corpus import wordnet as wn

singing_groups = dict()
for x in wn.all_lemma_names():
    ss = sort_string(x.replace('_','').lower())
    if len(ss) == 7:
        singing_groups[ss] = singing_groups.get(ss,[]) + [x]

names = [x for x in get_famous_names(80).keys() if x.count(' ') == 1]
names = [x for x in names if len(x.split(' ')[0])==3 and len(x.split(' ')[1])==5]

for name in names:
    fn,ln = name.lower().split(' ')
    ln = ln[:2] + ln[-2:]
    if sort_string(fn+ln) in singing_groups:
        print(name, singing_groups[sort_string(fn+ln)])
#!/usr/bin/env python
"""
NPR 2017-08-27
http://www.npr.org/2017/08/27/545580069/sunday-puzzle-categorically-speaking
This week's challenge is a common two-word expression. 
The expression consists of 8 letters and uses all five vowels — A, E, I, O and U. 
It has only three consonants, one of which is repeated. 
The first word in the expression has two letters and the second has six letters. 
What familiar expression is it?
"""

from nltk.corpus import wordnet as wn

#%%
for word in wn.all_lemma_names():
    if len(set(word).intersection(set('aeiou'))) == 5 and len(word) == 9 and word[2] == '_':
        print(word)
Example #46
    element_type = "lemma"
    name = String(nullable=False)

g = Graph()
g.add_proxy("lemma", Lemma)


# >>> g.add_proxy("knows", Knows)
# >>> james = g.people.create(name="James")
# >>> julie = g.people.create(name="Julie")
# >>> g.knows.create(james, julie)

import progressbar as pb

N = 150000
for N, ln in enumerate(wn.all_lemma_names()):
    pass

widgets = [pb.Counter(), '%d rows: ' % N, pb.Percentage(), ' ', pb.RotatingMarker(), ' ', pb.Bar(),' ', pb.ETA()]
pbar = pb.ProgressBar(widgets=widgets, maxval=N).start()

for i, ln in enumerate(wn.all_lemma_names()):
    pbar.update(i)
    lemma = g.lemma.create(name=str(ln))
    #Lemma(ln).save()
pbar.finish()

# class Knows(Relationship):

#     label = "knows"
Example #47
 def get_all_lemma_names(self):
     return wn.all_lemma_names('n')
'''
NPR Puzzle 2016-09-11

www.npr.org/2016/09/11/493408422/its-a-race-to-the-end-of-the-alphabet

Think of a well-known category with exactly seven things in it. 
Alphabetize the things from their ending letters, and the last 
letter alphabetically will be "e." In other words, no thing in 
this category ends in a letter after "e" in the alphabet. 
It's a category and set of seven things that everyone knows. 
What is it?
'''
import sys
sys.path.append('..')
from nprcommontools import get_category_members
from nltk.corpus import wordnet as wn
#%%
def last_letter_alphabetically(l):
    let = ''
    for x in l:
        if x[-1] > let:
            let = x[-1]
    return let

#%%
possible_categories = wn.all_lemma_names()
for cat in possible_categories:
    cat_members = get_category_members(cat)
    if len(cat_members) == 7 and last_letter_alphabetically(cat_members) == 'e':
        print(cat_members)
Example #49
#!/usr/bin/python
#===================================================================
# This codelet extracts all lemmas from WordNet by looking up
# the synsets for each lemma name and collecting the associated
# lemmas into a set.  Lemmas with a dot in the name are excluded.
# Copyright 2014, IEEE ENCS Humanoid Robot Project
#===================================================================

from __future__ import division
from nltk.corpus import wordnet as wn

lemmas = set()
for lemma_name in wn.all_lemma_names():
    for synset in wn.synsets(lemma_name):
        for lemma in synset.lemmas():
            if lemma.name().find('.') < 0:  # exclude lemmas with a dot in the name, because WN cannot parse them
                lemmas.add(synset.name() + '.' + lemma.name())

with open('all_lemmas.txt', 'w') as f:
    for lemma in lemmas:
        f.write(lemma + "\n")
#!/usr/bin/env python
"""
NPR 2017-09-10

Think of a famous quotation with 8 words. 
The initial letters of the first 4 words themselves spell a word, 
and the initial letters of the last 4 words spell another word. 
Both words rhyme with "jab." What quotation is it?
"""

import sys
sys.path.append('..')
import rhyme
from nltk.corpus import brown, wordnet as wn

brown_words = frozenset(brown.words())

#%%
# Check out words that rhyme with "jab"
rhymes_with_jab = [x for x in rhyme.all_rhymes('jab') if len(x) == 4]

# Last word starts and ends with "b", second-to-last with "a"
# (unless "sabb", "tabb", and "nabb" are common words)
b_words = [x for x in wn.all_lemma_names() if x.startswith('b') and x.endswith('b')  and x.isalpha()]
#print [x for x in brown_words if x.startswith('a') and x.endswith('a')]

for w in b_words:
    print([x for x in wn.all_lemma_names() if x.endswith('_{0}'.format(w)) and x.count('_') == 3])
Example #51
def all_lemma_names(pos=None, lang="tha"):
	return wordnet.all_lemma_names(pos=pos, lang=lang)
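A usage sketch for the Thai wrapper above (a guess at typical use, not taken from the original project):

thai_nouns = list(all_lemma_names(pos='n'))
print(len(thai_nouns), thai_nouns[:5])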
import collections
from operator import itemgetter
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor

wn_lemmas = set()
for lemma in wn.all_lemma_names(pos=wn.ADJ):
  wn_lemmas.add(lemma)

wn_adj_synsets = collections.defaultdict(set)

for word in wn_lemmas:
  for synset in wn.synsets(word, wn.ADJ):
    wn_adj_synsets[synset.name.lower()] = [lemma.lower() for lemma in synset.lemma_names ]

semcor_adjectives = set()
i = 0
for sent in semcor.tagged_sents(tag='both'):
  for c,chk in enumerate(sent):
    if chk.node and len(chk.node)>3 and chk.node[-3]=='.' and chk.node[-2:].isdigit() and chk[0].node.startswith('JJ'):
      if len(chk.leaves()) == 1:
        semcor_adjectives.add(chk.leaves()[0].lower())


semcor_synsets = set()
for s, words in wn_adj_synsets.items():
  for w in words:
    if w in semcor_adjectives:
      semcor_synsets.add(s.lower())

vectors = set()
import nltk
from tabulate import tabulate
# Install Open Multilingual Wordnet and Wordnet
# if not already installed.
nltkd = nltk.downloader.Downloader()
for corpus in ['wordnet','omw']:
    if not nltkd.is_installed(corpus):
        nltk.download(corpus)

from nltk.corpus import wordnet as wn

table = list()

for lang in sorted(wn.langs()):
    my_set_of_all_lemma_names = set()
    from nltk.corpus import wordnet as wn
    for aln_term in list(wn.all_lemma_names(lang=lang)):
        for synset in wn.synsets(aln_term):
            for lemma in synset.lemma_names():
                my_set_of_all_lemma_names.add(lemma)
    table.append([lang,
        len(set(wn.all_lemma_names(lang=lang))),
        len(my_set_of_all_lemma_names)])

print(tabulate(table,
    headers=["Language code",
        "all_lemma_names()",
        "lemma_name.synset.lemma.lemma_names()"]))
Think of a familiar two-word phrase in 8 letters — with 4 letters in each word. 
The first word starts with M. Move the first letter of the second word to the 
end and you'll get a regular 8-letter word, which, amazingly, other than the M, 
doesn't share any sounds with the original two-word phrase. What phrase is it?
'''
import sys
sys.path.append('..')
import nprcommontools as nct
import itertools
import requests
        
from nltk.corpus import wordnet as wn
from nltk.corpus import cmudict

#%%
words = set(x for x in wn.all_lemma_names() if x.count('_') == 0 and x.startswith('m'))
m_phrases = set()
url = 'http://www.codon.org.uk/~mjg59/tmp/wordlists/english_phrases.txt'
r = requests.get(url)
for line in r.text.split('\n'):
    line = line.strip()
    if line.count(' ') == 1 and line.split(' ')[0].startswith('m') and len(line.split(' ')[0]) == 4 and len(line.split(' ')[1]) == 4:
        m_phrases.add(line)

#%%
for m in m_phrases:
    m1,m2 = m.split(' ')
    new_word = m1 + m2[1:] + m2[0]
    if new_word in words:
        print(m, new_word)
def baseline():
    spa_words = {word for word in wn.all_lemma_names(lang='spa') if not discartable(word)}
    por_words = {word for word in wn.all_lemma_names(lang='por') if not discartable(word)}

    return [x for x in spa_words & por_words if
            set(wn.synsets(x, lang="spa")) & set(wn.synsets(x, lang="por")) == set()]
Example #56
import sys
import nltk
from nltk.corpus import wordnet, words
import nltk.util
print("NLTK import end")
import logging
# from gensim import corpora, models, similarities

# gensimoutdir = "/XXXprojects/cch/foresight/dat/gensim"

print(words.fileids())
for fid in words.fileids():
    print(fid, len(words.words(fid)))
    print(words.words(fid)[:5], "...", words.words(fid)[-5:])

# for k in wordnet.all_lemma_names():
#     print(k)
all_lemma_list = [k for k in wordnet.all_lemma_names()]
print(len(all_lemma_list))

# for k in wordnet.all_synsets():
#     print(k)
# all_synsets_list = [k for k in wordnet.all_synsets()]
# print(len(all_synsets_list))


sys.exit()

stopwords = nltk.corpus.stopwords.words('english')

freqdic = {}
freqtoklistdic = {}
freqfrqlistdic = {}
Example #57
        try:
            word = word.encode("utf-8", "ignore")
        except:
            pass
    for k, v in DIACRITICS.items():
        for v in v:
            word = word.replace(v, k)

    # Replace spaces with underscores
    word = word.replace(" ", "_")

    return word

### SYNSET #########################################################################################

NOUNS = lambda: wn.all_lemma_names(wn.NOUN)
VERBS = lambda: wn.all_lemma_names(wn.VERB)
ADJECTIVES = lambda: wn.all_lemma_names(wn.ADJ)
ADVERBS = lambda: wn.all_lemma_names(wn.ADV)

NOUN, VERB, ADJECTIVE, ADVERB = \
    NN, VB, JJ, RB = \
        "NN", "VB", "JJ", "RB"

_pattern2wordnet = {NN : wn.NOUN, VB : wn.VERB, JJ : wn.ADJ, RB: wn.ADV}
_wordnet2pattern = {v : k for k, v in _pattern2wordnet.items()}
_wordnet2pattern[wn.ADJ_SAT] = JJ


def synsets(word, pos=NOUN):
    """ Returns a list of Synset objects, one for each word sense.
"""
import sys
sys.path.append('..')
from nprcommontools import alpha_only, sort_string, get_synonyms
from nltk.corpus import wordnet as wn, brown
from collections import defaultdict

def is_adjective(word):
    ''' Check if the word is an adjective, per Wordnet '''
    syns = wn.synsets(word)
    for s in syns:
        if s.pos() in ('a','s'):
            return True
    return False

words5 = frozenset(x for x in brown.words() if len(x) == 5 and x.isalpha() and x.startswith('s') and is_adjective(x))
words4 = frozenset(x for x in brown.words() if len(x) == 4 and x.isalpha() and 's' in x and is_adjective(x))
words9 = dict((sort_string(alpha_only(x)),x) for x in wn.all_lemma_names() if len(alpha_only(x)) == 9)
#%%
for word in sorted(words5):
    syns = get_synonyms(word)
    for syn in syns.intersection(words4):
        s1 = sort_string('a' + word[1:] + syn)
        #if s1:
        #    print word, syn
        try:
            new_word = words9[s1]
            print(word, syn, new_word)
        except KeyError:
            pass