Example #1
def _get_words():
    nltk_path = os.path.join(os.path.dirname(__file__), 'nltk_data')
    if nltk_path not in nltk.data.path:
        nltk.data.path.insert(0, nltk_path)

    verbs = sorted({
        word.lower()
        for word in filter(_filter_words, wordnet.all_lemma_names(
            wordnet.VERB))
    })
    verb_size = len(verbs)

    nouns = sorted({
        word.lower()
        for word in filter(_filter_words, wordnet.all_lemma_names(
            wordnet.NOUN))
    })
    noun_size = len(nouns)

    adjs = sorted({
        word.lower()
        for word in filter(_filter_words, wordnet.all_lemma_names(wordnet.ADJ))
    })
    adj_size = len(adjs)

    conjs = sorted([
        'and', 'or', 'lest', 'till', 'nor', 'but', 'yet', 'so', 'unless',
        'when'
    ])
    conj_size = len(conjs)

    return (verbs, verb_size, nouns, noun_size, adjs, adj_size, conjs,
            conj_size)
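The _filter_words predicate used above is defined elsewhere in that project and is not shown here; a minimal sketch, assuming it simply keeps single-word, purely alphabetic lemmas, might look like this:

def _filter_words(word):
    # Hypothetical filter: keep only single-word, purely alphabetic lemmas
    # (multi-word WordNet lemmas contain underscores, which isalpha() rejects).
    return word.isalpha()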
def exact_matching():
    spa_words = {word for word in wn.all_lemma_names(lang='spa') if not discartable(word)}
    por_words = {word for word in wn.all_lemma_names(lang='por') if not discartable(word)}

    common_words = spa_words & por_words

    collator = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8'))
    return sorted(common_words, key=collator.getSortKey)
def similar_matching():
    """Matches ignoring accent."""
    spa_words = {change_similar_letters(word) for word in wn.all_lemma_names(lang='spa') if not discartable(word)}
    por_words = {change_similar_letters(word) for word in wn.all_lemma_names(lang='por') if not discartable(word)}

    common_words = spa_words & por_words

    collator = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8'))
    return sorted(common_words, key=collator.getSortKey)
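The change_similar_letters helper is not shown in this snippet; a minimal sketch, assuming it only strips accents so that Spanish and Portuguese spellings become comparable, could be:

import unicodedata

def change_similar_letters(word):
    # Hypothetical accent-folding helper: decompose characters and drop
    # combining marks, e.g. 'coração' -> 'coracao'.
    decomposed = unicodedata.normalize('NFKD', word)
    return ''.join(c for c in decomposed if not unicodedata.combining(c))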
Example #4
 def lemma_names() -> Dict[str, List[str]]:
     return merge_lemmas(
         ("cmn", wordnet.all_lemma_names(lang="cmn")),
         (
             "qcn",
             (get_opencc().convert(l)
              for l in wordnet.all_lemma_names(lang="qcn")),
         ),
         ("qwc", wordnet.all_lemma_names(lang="qwc")),
     )
Example #5
    def test_iterable_type_for_all_lemma_names(self):
        # Duck-test for iterables.
        # See https://stackoverflow.com/a/36230057/610569
        cat_lemmas = wn.all_lemma_names(lang='cat')
        eng_lemmas = wn.all_lemma_names(lang='eng')

        self.assertTrue(hasattr(eng_lemmas, '__iter__'))
        self.assertTrue(hasattr(eng_lemmas, '__next__') or hasattr(eng_lemmas, 'next'))
        self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)

        self.assertTrue(hasattr(cat_lemmas, '__iter__'))
        self.assertTrue(hasattr(cat_lemmas, '__next__') or hasattr(cat_lemmas, 'next'))
        self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
Example #7
def generate_word_graph(hyp, poly, holo, type):
    if type == 0:
        G1 = snap.TUNGraph.New()
    else:
        G1 = snap.TNGraph.New()
    hypedges = set()
    holoedges = set()
    polyedges = set()
    idToLemma = dict()
    lemmaToId = dict()
    count = 0
    for lemma_name in list(wn.all_lemma_names('n')):
        G1.AddNode(count)
        idToLemma[count] = lemma_name
        lemmaToId[lemma_name] = count
        count += 1
    for lemma_name in list(wn.all_lemma_names('n')):
        if hyp:
            for synset in wn.synsets(lemma_name, "n"):
                for synset2 in synset.hyponyms() + synset.instance_hyponyms():
                    for lemma_name2 in synset2.lemma_names():
                        lemma_name2 = lemma_name2.lower()
                        if type in [0, 1]:
                            G1.AddEdge(lemmaToId[lemma_name],
                                       lemmaToId[lemma_name2])
                            hypedges.add((lemmaToId[lemma_name],
                                          lemmaToId[lemma_name2]))
                        else:
                            G1.AddEdge(lemmaToId[lemma_name2],
                                       lemmaToId[lemma_name])
                            hypedges.add((lemmaToId[lemma_name2],
                                          lemmaToId[lemma_name]))
        if poly:
            for synset in wn.synsets(lemma_name, "n"):
                for lemma_name2 in synset.lemma_names():
                    lemma_name2 = lemma_name2.lower()
                    G1.AddEdge(lemmaToId[lemma_name], lemmaToId[lemma_name2])
                    polyedges.add(
                        (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
        if holo:
            for synset in wn.synsets(lemma_name, "n"):
                for synset2 in synset.member_holonyms() + synset.part_holonyms(
                ) + synset.substance_holonyms():
                    for lemma_name2 in synset2.lemma_names():
                        lemma_name2 = lemma_name2.lower()
                        G1.AddEdge(lemmaToId[lemma_name],
                                   lemmaToId[lemma_name2])
                        holoedges.add(
                            (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
    snap.DelSelfEdges(G1)
    return G1, idToLemma, lemmaToId, hypedges, polyedges, holoedges
Example #8
 def get_all_lemmas(self, replace_ws=True):
     all_wn_lemmas = list(wn.all_lemma_names())
     if replace_ws:
         all_wn_lemmas = [
             lemma.replace('_', ' ') for lemma in all_wn_lemmas
         ]
     return all_wn_lemmas
def main(argv):
  huang_vocab = LoadHuang()
  manaal_vocab = LoadManaal()
  brown_vocab = LoadBrown()

  all_lemmas = {x.lower() for x in wn.all_lemma_names(pos=wn.ADJ)}
  all_alpha_lemmas = {x for x in all_lemmas if x.isalpha()}
  all_synsets = set(wn.all_synsets(pos=wn.ADJ))
  all_alpha_synsets = {x for x in all_synsets if IsAlphaSS(x)}
  all_lemmas_with_single_synset = {x for x in all_lemmas if IsSingleSynset(x)}
  all_lemmas_ambig_synset = {x for x in all_lemmas if not IsSingleSynset(x)}
  all_lemmas_with_single_synset_alpha = {x for x in all_lemmas_with_single_synset if x.isalpha()}
  all_lemmas_ambig_synset_alpha = {x for x in all_lemmas_ambig_synset if x.isalpha()}
  all_alpha_lemmas_has_noun = {x for x in all_alpha_lemmas if LemmaHasNoun(x)}
  all_alpha_lemmas_has_noun_single_lexname = {x for x in all_alpha_lemmas_has_noun if IsNounSingleLexName(x) }
  print "all_lemmas:", len(all_lemmas)
  print "all_alpha_lemmas:", len(all_alpha_lemmas)
  print "all_synsets:", len(all_synsets)
  print "all_alpha_synsets:", len(all_alpha_synsets)
  print "all_lemmas_with_single_synset:", len(all_lemmas_with_single_synset)
  print "all_lemmas_ambig_synset:", len(all_lemmas_ambig_synset)
  print "all_lemmas_with_single_synset_alpha", len(all_lemmas_with_single_synset_alpha)
  print "all_lemmas_ambig_synset_alpha", len(all_lemmas_ambig_synset_alpha)
  print "all_alpha_lemmas_has_noun", len(all_alpha_lemmas_has_noun)
  print "all_alpha_lemmas_has_noun_single_lexname", len(all_alpha_lemmas_has_noun_single_lexname)
  print "huang.intersect(all_alpha_lemmas)", len(huang_vocab.intersection(all_alpha_lemmas))
  print "manaal.intersect(all_alpha_lemmas)", len(manaal_vocab.intersection(all_alpha_lemmas))
  print "brown.intersect(all_alpha_lemmas)", len(brown_vocab.intersection(all_alpha_lemmas))
  print "huang*manaal*brown*all_alpha_lemmas", len(huang_vocab.intersection(all_alpha_lemmas, manaal_vocab, brown_vocab))
  print "huang.intersect(all_lemmas_with_single_synset_alpha)", len(huang_vocab.intersection(all_lemmas_with_single_synset_alpha))
  print "manaal.intersect(all_lemmas_with_single_synset_alpha)", len(manaal_vocab.intersection(all_lemmas_with_single_synset_alpha))
  print "brown.intersect(all_lemmas_with_single_synset_alpha)", len(brown_vocab.intersection(all_lemmas_with_single_synset_alpha))
  print "huang*manaal*brown*all_lemmas_with_single_synset_alpha", len(huang_vocab.intersection(all_lemmas_with_single_synset_alpha, manaal_vocab, brown_vocab))
Example #10
def generateKnownPatterns():
    from nltk.corpus import brown, reuters, words, wordnet
    from string import ascii_lowercase as ALPH

    patterns = {}

    wordlist = sorted(
        set([x.lower()
             for x in brown.words()] + [x.lower() for x in reuters.words()] +
            [x.lower() for x in words.words()] +
            [x.lower() for x in wordnet.all_lemma_names()]))
    for word in list(wordlist):
        if any(x not in ALPH for x in word):
            wordlist.remove(word)
    with open("static/txt/wordlist.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(wordlist))

    for word in wordlist:
        p = pattern(word)
        if p in patterns:
            patterns[p].append(word)
        else:
            patterns[p] = [word]

    with open("static/txt/patterns.json", "w", encoding="utf-8") as f:
        json.dump(patterns, f)
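The pattern helper used above is not shown; a common choice for this kind of substitution-cipher word index is a canonical letter pattern, so a purely hypothetical implementation might be:

def pattern(word):
    # Hypothetical pattern function: map each distinct letter to the order of
    # its first appearance, so 'letter' -> '0.1.2.2.1.3'.
    order = {}
    return '.'.join(str(order.setdefault(ch, len(order))) for ch in word)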
def extractConditionAttribute(nouns,attributesList):
    greaterThanList = ['greater', 'bigger', 'higher', 'great', 'more','lesser', 'smaller', 'lower', 'less']
    lesserThanList = ['lesser', 'smaller', 'lower', 'less']
    equalList = ['equal', 'equals', 'same']
    extractedWordsList=[]
    conditionAttributeList=[]
    for word in nouns:
        id=0
        if(word in greaterThanList or word in lesserThanList or word in equalList):
            id=nouns.index(word)
            attributeName=nouns[id-1]
            extractedWordsList.append(attributeName)
            nouns.remove(nouns[id])
    for word in extractedWordsList:
        if (word in attributesList):
            conditionAttributeList.append(word)
        else:
            for att in attributesList:
                lemmas = set(wordnet.all_lemma_names())
                if (att in lemmas and word in lemmas):
                    #print("true")
                    s1 = wn.synsets(att)[0]
                    s2 = wn.synsets(word)[0]
                    sim = s1.wup_similarity(s2)
                    if sim >= 0.8:
                        conditionAttributeList.append(att)
    return conditionAttributeList
def get_antonyms(remove_duplicates=False):
    antonyms = []
    lemmas_with_antonyms = []
    lemma_antonym_pairs = []
    for l in wn.all_lemma_names():
        synsets = wn.synsets(l)
        for s in synsets:
            for s_l in s.lemmas():
                if s_l.name() != l:
                    # Here I only care about the antonyms of the current lemma
                    continue

                found_antonym = False
                for a in s_l.antonyms():
                    antonyms.append(a)
                    found_antonym = True

                    if remove_duplicates:
                        lemma_antonym_pairs.append(frozenset((s_l, a)))
                    else:
                        lemma_antonym_pairs.append((s_l, a))

                if found_antonym:
                    lemmas_with_antonyms.append(s_l)

    if remove_duplicates:
        lemma_antonym_pairs = list(set(lemma_antonym_pairs))

    return antonyms, lemmas_with_antonyms, lemma_antonym_pairs
Example #13
 def extract(self, blabla, max_pick_word):
     """
         Extract wordnet nouns (or proper noun) from a blabla, which are not on the memory, nor on the selGraph, nor in EXCLUDED
         #TODO: TAKE ALSO WIKIPEDIA STILL
         Self Quest bounded to a maximum of max_pick_word to avoid too long wait. Beware, of found wikipediable word!
         Beware of upper or lower letters which can create conflicts.
         #TODO: Test Edge Cases and memor
     """
     OKWordnet = []
     wn_lemmas = set(
         wordnet.all_lemma_names())  #TODO: SHALL LOAD IT ONLY ONCE???
     if len(blabla) == 0:  #empty
         self.log.info("No new words to grow from.")
     else:
         counter = 0  #count words added
         for word, pos in nltk.pos_tag(word_tokenize(blabla)):
             if counter < max_pick_word:  #Stop once has enough words
                 if pos in ['NN', 'NNS', 'NNP']:
                     if not word.isupper(
                     ):  #To avoid turning words like AI lower case. Else turn to lower case. Ex: donald_trump
                         word = word.lower()
                     #TODO: Need Lemmatizer to avoid words which have same roots?
                     if ((word in wn_lemmas) or
                         (wikipedia.page(word).exists())
                         ) and not (word in OKWordnet):
                         if word in self.graph.keys(
                         ):  #Word is there, augment its weight.
                             self.graph[word][0] = self.graph[word][0] * 1.1
                         else:  #TODO: Shall exclude memory ?
                             OKWordnet.append(word)
                             counter += 1
         #Special case of duo words for wikipedia, such as global_warming https://en.wikipedia.org/wiki/Global_warming
         #TODO: FOR THESE, use wikipedia!?
         wordList = blabla.split(
         )  #then need word.strip(string.punctuation)
         token_list = nltk.pos_tag(word_tokenize(blabla))
         counter = 0
         for token1, token2 in zip(token_list,
                                   token_list[1:]):  #Consecutive token
             word1, pos1 = token1
             word2, pos2 = token2  # pos2 is not used below
             if counter < max_pick_word and len(word1) > 1 and len(
                     word2) > 1 and (word1 not in EXCLUDED) and (
                         word2
                         not in EXCLUDED):  #Stop once has enough words
                 if not word1.isupper(
                 ):  #lower letter unless fully upper letter:check for proper noun
                     word1 = word1.lower()
                 if not word2.isupper(
                 ):  #lower letter unless fully upper letter
                     word2 = word2.lower()
                 duo = word1 + " " + word2
                 if wikipedia.page(duo).exists() and not (duo in OKWordnet):
                     if duo in self.graph.keys():
                         self.graph[duo][0] = self.graph[duo][0] * 1.1
                     else:
                         OKWordnet.append(duo)
                         counter += 1
         self.log.info("New words to learn from")  #+OKWordnet)
     return OKWordnet
Example #14
def init_dictionary():
    # Note: I also tried adding the words from /usr/share/dict/web,
    # but the only additional words it had that were not already in
    # the next two dicts were people's proper names. Not useful.
    global DICTIONARY
    DICTIONARY = set(nltk_words.words())
    DICTIONARY.update(nltk_wordnet.all_lemma_names())
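A possible usage sketch (not from the original project), assuming DICTIONARY is then used for simple membership checks:

init_dictionary()
for token in ['colour', 'color', 'qwzx']:
    # Membership test against the combined words-corpus + WordNet lemma set.
    print(token, token in DICTIONARY)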
Example #15
 def gloss(self, word):
     wn_lemmas = set(wn.all_lemma_names())
     if word in wn_lemmas:
         syn = wn.synsets(word)[0]
         return syn.definition()
     else:
         return None
Example #16
def polysemy_analysis():
    """Returns the average polysemy (number of senses) for nouns, verbs, adjectives, and adverbs in WordNet."""
    conditions = (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV)
    cfd = nltk.ConditionalFreqDist((pos, len(wn.synsets(lemma_name, pos)))
                                   for pos in conditions
                                   for lemma_name in wn.all_lemma_names(pos))
    for pos in cfd.conditions():
        print("{0:2s} {1:10f}".format(pos, sum(item[0] * item[1] for item in cfd[pos].items()) / cfd[pos].N()))
def polysemy_meth2(word_class):
    print("Computing average polysemy of word class(" + word_class +
          ") .........")
    polysemy_count = 0
    all_lemma_names = set(wordnet.all_lemma_names(word_class))
    for name in all_lemma_names:
        polysemy_count += len(wordnet.synsets(name, word_class))
    return polysemy_count / len(all_lemma_names)
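A usage sketch (not part of the original projects) that reports the average polysemy per part of speech with polysemy_meth2:

for pos in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV):
    print(pos, polysemy_meth2(pos))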
def filterTokensUsingNltkWordnet():
    # for k in wordnet.all_lemma_names():
    #     print(k)
    all_lemma_list = [k for k in wordnet.all_lemma_names()]
    
    tokset = set(freqtoklistdic["all"])
    wntset = set(all_lemma_list)
    iset = tokset.intersection(wntset)
    inwordnet = len(iset)
Example #19
def synset_senses(text):
    synsets = []
    wn_lemmas = set(wn.all_lemma_names())
    for word, tag in text:
        lemma = lemmatizer.lemmatize(word, tag)
        if lemma in wn_lemmas:
            words = (list(swn.senti_synsets(word))[0])
            synsets.append(words)
    return synsets
Example #20
 def all(self):
     thai_wn_words = []
     thai_wn_lemmas = [x for x in wn.all_lemma_names(lang='tha')]
     for word in thai_wn_lemmas:
         meanings = wn.synsets(word, lang='tha')
         word = word.replace('_', ' ')
         for meaning in meanings:
             thai_wn_words.append((word, meaning.lemma_names()[0]))
     indices = list(range(len(thai_wn_words)))
     return thai_wn_words, indices
Example #21
def print_sentence(word):
    if word in word_in_sentence:
        if word in wn.all_lemma_names():
            return (wn.synsets(word)[0].examples())
        else:
            msg = ["cannot find example sentence for this word"]
            return msg
    else:
        msg = ["cannot find example sentence for this word"]
        return msg
Example #22
    def __init__(self, exwordnet, name, pos, lang='eng'):
        self._exwordnet = exwordnet
        if name not in wn.all_lemma_names(pos=pos, lang=lang):
            raise exWordNetError('word %s.%s in %s is not defined in WordNet' %
                                 (name, pos, lang))

        self._name = name
        self._pos = pos
        self._lang = lang
        self._key = '%s.%s.%s' % (name, pos, lang)
Example #23
    def all_words(self, pos=None, lang='eng'):
        if pos is None:
            pos_tags = ['n', 'v', 'a', 'r']
        else:
            pos_tags = [pos]

        for pos_tag in pos_tags:
            for l in wn.all_lemma_names(pos=pos_tag, lang=lang):
                word = Word(self, l, pos_tag, lang=lang)
                yield word
Example #24
def generate_word_graph(hyp, poly, holo):
    G1 = nx.Graph()
    hypedges = set()
    holoedges = set()
    polyedges = set()
    idToLemma = dict()
    lemmaToId = dict()
    count = 0
    for lemma_name in list(wn.all_lemma_names('n')):
        G1.add_node(count)
        idToLemma[count] = lemma_name
        lemmaToId[lemma_name] = count
        count += 1
    for lemma_name in list(wn.all_lemma_names('n')):
        if hyp:
            for synset in wn.synsets(lemma_name, "n"):
                for synset2 in synset.hyponyms() + synset.instance_hyponyms():
                    for lemma_name2 in synset2.lemma_names():
                        lemma_name2 = lemma_name2.lower()
                        G1.add_edge(lemmaToId[lemma_name],
                                    lemmaToId[lemma_name2])
                        hypedges.add(
                            (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
        if poly:
            for synset in wn.synsets(lemma_name, "n"):
                for lemma_name2 in synset.lemma_names():
                    lemma_name2 = lemma_name2.lower()
                    G1.add_edge(lemmaToId[lemma_name], lemmaToId[lemma_name2])
                    polyedges.add(
                        (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
        if holo:
            for synset in wn.synsets(lemma_name, "n"):
                for synset2 in synset.member_holonyms() + synset.part_holonyms(
                ) + synset.substance_holonyms():
                    for lemma_name2 in synset2.lemma_names():
                        lemma_name2 = lemma_name2.lower()
                        G1.add_edge(lemmaToId[lemma_name],
                                    lemmaToId[lemma_name2])
                        holoedges.add(
                            (lemmaToId[lemma_name], lemmaToId[lemma_name2]))
    G1.remove_edges_from(G1.selfloop_edges())
    return G1, idToLemma, lemmaToId, hypedges, polyedges, holoedges
Example #25
def process():
    # Install Open Multilingual Wordnet if not already installed.
    nltkd = nltk.downloader.Downloader()
    if not nltkd.is_installed('omw'):
        nltk.download('omw')

    # Figure out ISO 639-2 code for specified locale. Exit if unavailable.
    print(args.language)
    iso639_2 = langcodes.best_match(args.language, wn.langs())[0]
    print(iso639_2)
    print(wn.langs())
    if iso639_2 == 'und': # Nearest ISO 639-2 code is undefined.
        exit("Requested language is not available on this NLTK Wordnet installation.")

    # Obtain set of lowercased lemmas that belong to only one part of speech.
    posdb = dict()
    single_pos_lemmas = set()
    for pos in ['a', 'r', 'n', 'v']:
        posdb[pos] = set()
        # Note: wn.all_lemma_names() returns the lemma names in all lowercase.
        # To remove lemmas that are sometimes or always capitalised in normal
        # writing (e.g. "China" or "Arulo"), we will need to obtain capitalised
        # lemmas from Wordnet later on, and remove members of our output set
        # that are identical to the lowercased transformation of those
        # capitalised lemmas.
        for lemma in wn.all_lemma_names(pos=pos, lang=iso639_2):
            posdb[pos].add(lemma)
        single_pos_lemmas.symmetric_difference_update(posdb[pos])

    # Remove lemmas containing characters other than a-z.
    output_set = set()
    for term in single_pos_lemmas:
        if non_word.search(term) != None:
            continue
        output_set.add(term)

    # Obtain a set of lemmas that are typically capitalised in normal writing.
    unlowered_lemmas = set()
    for synset in list(wn.all_synsets()):
        for lemma in synset.lemma_names():
            unlowered_lemmas.add(lemma)
    # Remove members of output_set that match the lowercased form of a lemma
    # that is typically capitalised in WordNet (see the note above).
    typically_capitalised = {lemma.lower() for lemma in unlowered_lemmas if not lemma.islower()}
    output_set = {word for word in output_set if word not in typically_capitalised}
    # Filter inspiration: http://stackoverflow.com/a/16562558
    output_set = [x for x in output_set if len(x) > 4 and len(x) < 7]
    names_lowered = set()
    for name in nltk.corpus.names.words():
        names_lowered.add(name.lower())
    output_set = [x for x in output_set if x not in names_lowered]
    print(output_set)
    # print single_pos_lemmas
    print(len(single_pos_lemmas))
    print(len(output_set))
Example #26
    def extractWordsAndSynsets(self, filenameWords, filenameSynsets,  filenameLexemes):
        #file
        fWords = codecs.open(filenameWords, 'w', 'utf-8')
        fSynsets = codecs.open(filenameSynsets, 'w',  'utf-8')
        fLexemes = codecs.open(filenameLexemes, 'w',  'utf-8')

        wordCounter = 0
        wordCounterAll = 0
        synsetCounter = 0
        synsetCounterAll = 0
        lexemCounter = 0
        lexemCounterAll = 0

        ovv = []

        for pos in self.pos_list:
            for word in wn.all_lemma_names(pos=pos, lang=self.lang):
                wordCounterAll += 1
                self.WordIndex[word] = wordCounterAll
                fWords.write(word+" ")
                synsetInWord = 0
                for synset in wn.synsets(word, lang=self.lang):
                    lexemCounterAll += 1
                    synsetId = synset.name()
                    if self.Shared.in_vocab(synsetId):
                        synsetInWord += 1
                        if synsetId not in self.SynsetIndex:
                            fSynsets.write(synsetId + " " + self.Shared.getVectorAsString(self.Shared.model[synsetId]) + "\n")
                            synsetCounter += 1
                            self.SynsetIndex[synsetId] = synsetCounter

                        lexemCounter += 1
                        #lemma name
                        sensekey = wn.lemma(synset.name()+'.'+word).key()

                        fWords.write(sensekey + ",")
                        fLexemes.write(str(self.SynsetIndex[synsetId]) + " " + str(wordCounterAll) + "\n")
                    else:
                        ovv.append(synsetId)


                fWords.write("\n")
                if synsetInWord != 0:
                    wordCounter += 1
                else:
                    self.WordIndex[word] = -1
        fWords.close()
        fSynsets.close()
        fLexemes.close()
        print("   Words: %d / %d\n" % (wordCounter, wordCounterAll))
        print("  Synset: %d / %d\n" % (synsetCounter, synsetCounter + len(ovv)))
        print("  Lexems: %d / %d\n" % (lexemCounter, lexemCounterAll))
Example #27
def trans_verb_list():
    '''Generate a list of transitive verbs.'''
    transitive_verbs = []
    for word in wordnet.all_lemma_names('v'):
        frame_ids = set()
        for lem in wordnet.lemmas(word, 'v'):
            frame_ids.update(lem.frame_ids())
        # Verbs with these frames make sense for our sentences.
        if frame_ids.intersection({8, 9, 10, 11}):
            transitive_verbs.append(word)

    # Remove duplicates by converting to set and back in case of
    # malicious WordNet.
    return list(set(transitive_verbs))
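For context, the frame IDs tested above come from WordNet's verb frames; a small sketch (using the standard NLTK wordnet API) showing how to inspect them for one verb:

from nltk.corpus import wordnet

for lem in wordnet.lemmas('give', 'v'):
    # Each verb lemma lists the syntactic frames it participates in.
    print(lem, lem.frame_ids())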
Example #28
def retrieve_unambiguous(extraction_dictionary,
                         with_example=True,
                         keep_mwe=True):
    """
    read unambiguous definitions from nltk wordnet
    """
    lemma_data = set()
    if keep_mwe:
        lemma_data = {(lm.lower(), lg)
                      for lg in extraction_dictionary
                      for p in extraction_dictionary[lg]
                      for lm in wn.all_lemma_names(lang=lg, pos=p)}
    else:
        lemma_data = {(lm.lower(), lg)
                      for lg in extraction_dictionary
                      for p in extraction_dictionary[lg]
                      for lm in wn.all_lemma_names(lang=lg, pos=p)
                      if "_" not in lm}
    for lemma, lang in lemma_data:
        synsets = wn.synsets(lemma)
        if len(synsets) == 1:
            synset = synsets[0]
            if with_example:
                examples = {
                    ex
                    for ex in synset.examples()
                    if lemma in set(ex.split())
                }
                if not len(examples):
                    yield lemma, lang, synset.pos(), synset.definition(), ""
                else:
                    for example in examples:
                        yield lemma, lang, synset.pos(), synset.definition(
                        ), example
            else:
                yield lemma, lang, synset.pos(), synset.definition()
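A hedged usage sketch, assuming extraction_dictionary maps language codes to lists of parts of speech (as the comprehensions above imply):

extraction_dictionary = {'eng': ['n', 'v']}
for row in retrieve_unambiguous(extraction_dictionary, with_example=False):
    print(row)  # (lemma, lang, pos, definition)
    break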
Example #29
def get_rev_map(lang):
    if lang not in WORDNET_FILTERS:
        return lambda x: x
    filter = WORDNET_FILTERS[lang]

    def rev_map(x):
        return _rev_maps[lang].get(x, x)

    if lang not in _rev_maps:
        m = {}
        for lemma in wordnet.all_lemma_names(lang=lang):
            filtered_lemma = filter(lemma)
            m[filtered_lemma] = lemma
        _rev_maps[lang] = m
    return rev_map
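WORDNET_FILTERS and _rev_maps are defined elsewhere in that project; a minimal sketch of the assumed shape, with a purely hypothetical lowercasing filter for Finnish:

WORDNET_FILTERS = {'fin': str.lower}   # hypothetical: per-language lemma filters
_rev_maps = {}                         # lang -> {filtered lemma: original lemma}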
 def _is_object(self, word):
     wn_lemmas = set(wordnet.all_lemma_names())
     lemmatizer = WordNetLemmatizer()
     word = lemmatizer.lemmatize(word)
     str_var = "".join(word + ".n.01")
     if word in wn_lemmas and str_var in [
             syn.name() for syn in wordnet.synsets(word)
     ]:
         w = wordnet.synset(str_var)
         synset_obj = wordnet.synset("object.n.01")
         if synset_obj.wup_similarity(w) >= OBJECT_SIMILARITY:
             return True
         else:
             return False
     return False
Example #31
def average_polysemy(part_of_speech):

    # 1. aggregate all lemmas in wordnet that have the given POS (hint: use nltk.corpus.wordnet.all_lemma_names)
    # 2. sum the number of meanings of each lemma (restricted to the given POS)
    # 3. return the average polysemy of a given POS

    all_lemmas = set(wn.all_lemma_names(part_of_speech))

    #print(len(all_lemmas))

    meanings_length = 0
    for lemma in all_lemmas:
        meanings = wn.synsets(lemma, part_of_speech)
        meanings_length = meanings_length + len(meanings)

    return meanings_length / len(all_lemmas)
Example #32
    def filterWN(self):
        '''
        Takes the words from self.words that are in WordNet and saves them in
        self.filtered_words.
        :return: None
        '''
        self.logger.info('Starting filtrated with WordNet')

        wn_lemmas = set(wn.all_lemma_names())
        for j in self.words:
            if j in wn_lemmas:
                self.filtered_words.append(j)

        self.filtered_words = list(set(self.filtered_words))

        self.logger.info('Finished filtrated with WordNet')
Example #33
 def synset_ratio(self):
     new_var = "vtn_synsets"
     wn_lemmas = [l.encode('ascii') for l in wn.all_lemma_names()]
     has_item = self.compare_items(wn_lemmas)
     new_column = []
     if False in has_item:
         self._warning_msg('synset_ratio', wn_lemmas)
     for record, exists in zip(self._dict, has_item):
         if exists:
             n_synsets = len(wn.synsets(record, 'n'))
             v_synsets = len(wn.synsets(record, 'v'))
             vtn_synsets = float(v_synsets) / (n_synsets + v_synsets)
         else:
             vtn_synsets = None
         new_column.append(vtn_synsets)
     self._append_column(new_column, new_var)
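A small illustration (not part of the original class) of the verb-to-(noun+verb) synset ratio computed above, for a single word:

n_synsets = len(wn.synsets('run', 'n'))
v_synsets = len(wn.synsets('run', 'v'))
# Fraction of this word's noun+verb synsets that are verbal.
print(float(v_synsets) / (n_synsets + v_synsets))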
Example #34
def exercise4():
    words = [n for n in wn.all_lemma_names()]
    more_than_one = []
    for word in words:

        tags = []
        for synset in wn.synsets(word):
            dot_location = synset.name().find('.')
            if synset.name()[0:dot_location] == word:
                tags.append(synset.name()[dot_location + 1:dot_location + 2])
        if len((set(tags))) > 1:
            more_than_one.append(word)

    print("More than one tagged words:", len(more_than_one))
    print("Total:", len(words))
    print("Percentage is:", (len(more_than_one) / len(words)) * 100)
Example #35
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .s2v
    file of sense-tagged tokens (one line per input sentence).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    msg.text("Preprocessing text...")
    texts = [line.rstrip() for line in open(in_file, 'r')]
    docs = nlp.pipe(texts, n_process=n_process)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    wn_lemmas = set(wordnet.all_lemma_names())
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            # print(doc)
            spans = get_phrases(doc, wn_lemmas)
            spans = filter_spans(spans)
            # print('NOUN SPAN', str(spans))
            doc = merge_phrases(doc, spans)
            spans = get_adjective_phrases(doc)
            spans = filter_spans(spans)
            # print('ADJ SPAN', str(spans))
            # print('*-----------------------------------------*')
            doc = merge_phrases(doc, spans)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
Example #36
def generate_irrelevant(file_name='irrelevent_min.txt'):
    # generate irrelevant word pairs, will be used as negative label
    all_lemma = [i for i in wn.all_lemma_names() if '_' not in i]
    length = len(all_lemma)
    count = 0
    f = open(file_name, 'w')
    for i in wn.all_synsets():
        m = len(i.lemma_names()) // 3 + 1
        for j in range(m):
            current_word = i.lemma_names()[random.randint(0, len(i.lemma_names()) - 1)]
            if '_' not in current_word:
                for _ in range(2):
                    word = all_lemma[random.randint(0, length - 10)]
                    if word not in i.lemma_names():
                        count += 1
                        f.write(current_word + ', ' + word + '\n')
    print(count)
    f.close()
Example #37
 def wordnet_synsets(self):
     """Number of WordNet "synsets" (roughly, senses) for the given word.
     This variable collapses across different parts of speech, meanings,
     etc.
     """
     new_var = "n_synsets"
     wn_lemmas = [l.encode('ascii') for l in wn.all_lemma_names()]
     has_item = self.compare_items(wn_lemmas)
     new_column = []
     if False in has_item:
         self._warning_msg('wordnet_synsets', wn_lemmas)
     for record, exists in zip(self._dict, has_item):
         if exists:
             n_synsets = len(wn.synsets(record))
         else:
             n_synsets = None
         new_column.append(n_synsets)
     self._append_column(new_column, new_var)
Example #38
def monosemous():
    all_lemmas = list(wn.all_lemma_names())
    print("First ten lemmas: ", all_lemmas[:10])
    print("All lemmas: ", len(all_lemmas))

    # Only interested in single words
    single_word_lemmas = [x for x in all_lemmas if '_' not in x]
    print("First ten single word lemmas: ", single_word_lemmas[:10])
    print("Single word lemmas: ", len(single_word_lemmas))

    # Find monosemous
    monosemous = [x for x in single_word_lemmas if len(wn.synsets(x)) == 1]
    print(monosemous[:10])
    senses = [(x, wn.synsets(x)[0]) for x in monosemous]
    noun_senses = [x for x in senses if x[1].pos() == 'n']
    print("First ten monosemous nouns: ", noun_senses[:10])
    print("Number of monosemous nouns: ", len(noun_senses))
    return noun_senses
Example #39
def main(output):
    pronouns = {}
    for lemma in wordnet.all_lemma_names(lang="fin"):
        if "_" not in lemma:
            continue
        for b in lemma.split("_"):
            if b not in ALL_ABBRVS:
                continue
            pronouns.setdefault(b, []).append(lemma)
    if output == "group":
        for p, v in sorted(pronouns.items()):
            print(p, v)
    elif output == "abbrv":
        for p, v in sorted(pronouns.items()):
            print(p)
    elif output == "pron":
        for v in sorted((v for lemma in pronouns.values() for v in lemma)):
            print(v)
def getTableName(word, tables):
    tableList = ''
    for table in tables:
        if word == table:
            tableList = table
        else:
            s1Lemmas = set(wordnet.all_lemma_names())
            if table in s1Lemmas and word in s1Lemmas:
                s1 = wn.synsets(table)[0]
                s2 = wn.synsets(word)[0]
                sim = s1.wup_similarity(s2)
                if sim is None:
                    return
                elif sim >= 0.8:
                    tableList = table
                else:
                    return

    return tableList
Example #41
def words_of_type(word_type, min_frequency=4):
    '''
    Generate a list of words of WordNet word_type that have a total frequency of at least
    min_frequency times across all senses of the word with word_type.
    '''
    try:
        with open(word_type + '.' + str(min_frequency), 'r') as file:
            return file.read().split('\n')
    except:
        words = []
        for word in wordnet.all_lemma_names(wordnet.__getattribute__(word_type)):
            counts = [lem.count() for lem in wordnet.lemmas(word, wordnet.__getattribute__(word_type))]

            if sum(counts) >= min_frequency:
                words.append(word)

        words = [item for item in words if not item.isdigit()]

        with open(word_type + '.' + str(min_frequency), 'w') as file:
            # Remove duplicates by converting to set and back in case of
            # malicious WordNet.
            file.write('\n'.join(list(set(words))))

        return words
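A hedged usage sketch, assuming word_type names a WordNet POS constant such as 'VERB' (the code resolves it with wordnet.__getattribute__):

frequent_verbs = words_of_type('VERB', min_frequency=4)
print(len(frequent_verbs), frequent_verbs[:10])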
Example #42
def noun_vocab(tcm=None, postagger=None, min_length=0):
    """
    Return all nouns found in wordnet in both singular and plural forms,
    along with POS tag and synset (as given by a TreeCutModel instance).
    """
    if not postagger:
        postagger = BackoffTagger()

    getpostag = lambda word : postagger.tag([word])[0][1]
    singular_n_pos = getpostag("house")
    plural_n_pos   = getpostag("houses")

    nouns = set()

    for lemma in wn.all_lemma_names(pos = 'n'):
        if len(lemma) < min_length:
            continue
        if '_' in lemma:
            continue

        plural = None
        if lemma[-1] != 's':
            plural = pluralize(lemma)
            # use the plural only if it still enables us to
            # get to the synsets (some words shouldn't be pluralized)
            if len(wn.synsets(plural)) == 0:
                plural = None

        for syn in wn.synsets(lemma, 'n'):
            classes = tcm.predict(syn) if tcm is not None else [syn.name()]
            for classy in classes:
                nouns.add((lemma, singular_n_pos, classy))
                if plural is not None:
                    nouns.add((plural, plural_n_pos, classy))

    return nouns
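A hedged usage sketch; without a TreeCutModel each noun is labelled with its own synset name:

nouns = noun_vocab(min_length=3)
# Each entry is a (word form, POS tag, synset/class name) triple.
print(len(nouns))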
Example #43
def diversity(pos):
    lemmas = list(wordnet.all_lemma_names(pos=pos))
    count = 0
    for word in lemmas:
        count = count + len(wordnet.synsets(word, pos))
    return count / len(lemmas)
#!/usr/bin/env python
"""
NPR 2017-12-31
https://www.npr.org/2017/12/31/574287856/sunday-puzzle-new-names-in-the-news

Name a famous singer — 3 letters in the first name, 5 letters in the last. 
Drop the middle letter of the last name and rearrange the result to name a 
variety of singing group. What is it?
"""
#%%
import sys
sys.path.append('..')

from nprcommontools import get_famous_names, sort_string
from nltk.corpus import wordnet as wn

singing_groups = dict()
for x in wn.all_lemma_names():
    ss = sort_string(x.replace('_','').lower())
    if len(ss) == 7:
        singing_groups[ss] = singing_groups.get(ss,[]) + [x]

names = [x for x in get_famous_names(80).keys() if x.count(' ') == 1]
names = [x for x in names if len(x.split(' ')[0])==3 and len(x.split(' ')[1])==5]

for name in names:
    fn,ln = name.lower().split(' ')
    ln = ln[:2] + ln[-2:]
    if sort_string(fn+ln) in singing_groups:
        print(name, singing_groups[sort_string(fn+ln)])
#!/usr/bin/env python
"""
NPR 2017-08-27
http://www.npr.org/2017/08/27/545580069/sunday-puzzle-categorically-speaking
This week's challenge is a common two-word expression. 
The expression consists of 8 letters and uses all five vowels — A, E, I, O and U. 
It has only three consonants, one of which is repeated. 
The first word in the expression has two letters and the second has six letters. 
What familiar expression is it?
"""

from nltk.corpus import wordnet as wn

#%%
for word in wn.all_lemma_names():
    if len(set(word).intersection(set('aeiou'))) == 5 and len(word) == 9 and word[2] == '_':
        print(word)
Example #46
    element_type = "lemma"
    name = String(nullable=False)

g = Graph()
g.add_proxy("lemma", Lemma)


# >>> g.add_proxy("knows", Knows)
# >>> james = g.people.create(name="James")
# >>> julie = g.people.create(name="Julie")
# >>> g.knows.create(james, julie)

import progressbar as pb

N = 150000
for N, ln in enumerate(wn.all_lemma_names()):
    pass

widgets = [pb.Counter(), '%d rows: ' % N, pb.Percentage(), ' ', pb.RotatingMarker(), ' ', pb.Bar(),' ', pb.ETA()]
pbar = pb.ProgressBar(widgets=widgets, maxval=N).start()

for i, ln in enumerate(wn.all_lemma_names()):
    pbar.update(i)
    lemma = g.lemma.create(name=str(ln))
    #Lemma(ln).save()
pbar.finish()

# class Knows(Relationship):

#     label = "knows"
Example #47
 def get_all_lemma_names(self):
     return wn.all_lemma_names('n')
'''
NPR Puzzle 2016-09-11

www.npr.org/2016/09/11/493408422/its-a-race-to-the-end-of-the-alphabet

Think of a well-known category with exactly seven things in it. 
Alphabetize the things from their ending letters, and the last 
letter alphabetically will be "e." In other words, no thing in 
this category ends in a letter after "e" in the alphabet. 
It's a category and set of seven things that everyone knows. 
What is it?
'''
import sys
sys.path.append('..')
from nprcommontools import get_category_members
from nltk.corpus import wordnet as wn
#%%
def last_letter_alphabetically(l):
    let = ''
    for x in l:
        if x[-1] > let:
            let = x[-1]
    return let

#%%
possible_categories = wn.all_lemma_names()
for cat in possible_categories:
    cat_members = get_category_members(cat)
    if len(cat_members) == 7 and last_letter_alphabetically(cat_members) == 'e':
        print(cat_members)
Example #49
#!/usr/bin/python
#===================================================================
# This codelet extracts all lemmas from WordNet by looking up
# the synsets for each lemma name and collecting the associated
# lemmas into a set.  Lemmas with a dot in the name are excluded.
# Copyright 2014, IEEE ENCS Humanoid Robot Project
#===================================================================

from __future__ import division
from nltk.corpus import wordnet as wn

lemmas = set()
for lemma_name in wn.all_lemma_names():
    for synset in wn.synsets(lemma_name):
        for lemma in synset.lemmas():
            if lemma.name().find('.') < 0:  # exclude lemmas with a dot in the name, because WN cannot parse them
                lemmas.add(synset.name() + '.' + lemma.name())

with open('all_lemmas.txt', 'w') as f:
    for lemma in lemmas:
        f.write(lemma + "\n")
#!/usr/bin/env python
"""
NPR 2017-09-10

Think of a famous quotation with 8 words. 
The initial letters of the first 4 words themselves spell a word, 
and the initial letters of the last 4 words spell another word. 
Both words rhyme with "jab." What quotation is it?
"""

import sys
sys.path.append('..')
import rhyme
from nltk.corpus import brown, wordnet as wn

brown_words = frozenset(brown.words())

#%%
# Check out words that rhyme with "jab"
rhymes_with_jab = [x for x in rhyme.all_rhymes('jab') if len(x) == 4]

# Last word starts and ends with "b", second-to-last with "a"
# (unless "sabb", "tabb", and "nabb" are common words)
b_words = [x for x in wn.all_lemma_names() if x.startswith('b') and x.endswith('b')  and x.isalpha()]
#print [x for x in brown_words if x.startswith('a') and x.endswith('a')]

for w in b_words:
    print([x for x in wn.all_lemma_names() if x.endswith('_{0}'.format(w)) and x.count('_') == 3])
Example #51
def all_lemma_names(pos=None, lang="tha"):
	return wordnet.all_lemma_names(pos=pos, lang=lang)
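A usage sketch for the Thai wrapper above (a guess at typical use, not taken from the original project):

thai_nouns = list(all_lemma_names(pos='n'))
print(len(thai_nouns), thai_nouns[:5])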
import collections
from operator import itemgetter
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor

wn_lemmas = set()
for lemma in wn.all_lemma_names(pos=wn.ADJ):
  wn_lemmas.add(lemma)

wn_adj_synsets = collections.defaultdict(set)

for word in wn_lemmas:
  for synset in wn.synsets(word, wn.ADJ):
    wn_adj_synsets[synset.name.lower()] = [lemma.lower() for lemma in synset.lemma_names ]

semcor_adjectives = set()
i = 0
for sent in semcor.tagged_sents(tag='both'):
  for c,chk in enumerate(sent):
    if chk.node and len(chk.node)>3 and chk.node[-3]=='.' and chk.node[-2:].isdigit() and chk[0].node.startswith('JJ'):
      if len(chk.leaves()) == 1:
        semcor_adjectives.add(chk.leaves()[0].lower())


semcor_synsets = set()
for s, words in wn_adj_synsets.items():
  for w in words:
    if w in semcor_adjectives:
      semcor_synsets.add(s.lower())

vectors = set()
import nltk
from tabulate import tabulate
# Install Open Multilingual Wordnet and Wordnet
# if not already installed.
nltkd = nltk.downloader.Downloader()
for corpus in ['wordnet','omw']:
    if not nltkd.is_installed(corpus):
        nltk.download(corpus)

from nltk.corpus import wordnet as wn

table = list()

for lang in sorted(wn.langs()):
    my_set_of_all_lemma_names = set()
    from nltk.corpus import wordnet as wn
    for aln_term in list(wn.all_lemma_names(lang=lang)):
        for synset in wn.synsets(aln_term):
            for lemma in synset.lemma_names():
                my_set_of_all_lemma_names.add(lemma)
    table.append([lang,
        len(set(wn.all_lemma_names(lang=lang))),
        len(my_set_of_all_lemma_names)])

print(tabulate(table,
    headers=["Language code",
        "all_lemma_names()",
        "lemma_name.synset.lemma.lemma_names()"]))
Think of a familiar two-word phrase in 8 letters — with 4 letters in each word. 
The first word starts with M. Move the first letter of the second word to the 
end and you'll get a regular 8-letter word, which, amazingly, other than the M, 
doesn't share any sounds with the original two-word phrase. What phrase is it?
'''
import sys
sys.path.append('..')
import nprcommontools as nct
import itertools
import requests
        
from nltk.corpus import wordnet as wn
from nltk.corpus import cmudict

#%%
words = set(x for x in wn.all_lemma_names() if x.count('_') == 0 and x.startswith('m'))
m_phrases = set()
url = 'http://www.codon.org.uk/~mjg59/tmp/wordlists/english_phrases.txt'
r = requests.get(url)
for line in r.text.split('\n'):
    line = line.strip()
    if line.count(' ') == 1 and line.split(' ')[0].startswith('m') and len(line.split(' ')[0]) == 4 and len(line.split(' ')[1]) == 4:
        m_phrases.add(line)

#%%
for m in m_phrases:
    m1,m2 = m.split(' ')
    new_word = m1 + m2[1:] + m2[0]
    if new_word in words:
        print(m, new_word)
def baseline():
    spa_words = {word for word in wn.all_lemma_names(lang='spa') if not discartable(word)}
    por_words = {word for word in wn.all_lemma_names(lang='por') if not discartable(word)}

    return [x for x in spa_words & por_words if
            set(wn.synsets(x, lang="spa")) & set(wn.synsets(x, lang="por")) == set()]
Example #56
import sys
import nltk
from nltk.corpus import wordnet, words
import nltk.util
print("NLTK import end")
import logging
# from gensim import corpora, models, similarities

# gensimoutdir = "/XXXprojects/cch/foresight/dat/gensim"

print(words.fileids())
for fid in words.fileids():
    print(fid, len(words.words(fid)))
    print(words.words(fid)[:5], "...", words.words(fid)[-5:])

# for k in wordnet.all_lemma_names():
#     print(k)
all_lemma_list = [k for k in wordnet.all_lemma_names()]
print(len(all_lemma_list))

# for k in wordnet.all_synsets():
#     print(k)
# all_synsets_list = [k for k in wordnet.all_synsets()]
# print(len(all_synsets_list))


sys.exit()

stopwords = nltk.corpus.stopwords.words('english')

freqdic = {}
freqtoklistdic = {}
freqfrqlistdic = {}
Example #57
        try:
            word = word.encode("utf-8", "ignore")
        except:
            pass
    for k, v in DIACRITICS.items():
        for v in v:
            word = word.replace(v, k)

    # Replace spaces with underscores
    word = word.replace(" ", "_")

    return word

### SYNSET #########################################################################################

NOUNS = lambda: wn.all_lemma_names(wn.NOUN)
VERBS = lambda: wn.all_lemma_names(wn.VERB)
ADJECTIVES = lambda: wn.all_lemma_names(wn.ADJ)
ADVERBS = lambda: wn.all_lemma_names(wn.ADV)

NOUN, VERB, ADJECTIVE, ADVERB = \
    NN, VB, JJ, RB = \
        "NN", "VB", "JJ", "RB"

_pattern2wordnet = {NN : wn.NOUN, VB : wn.VERB, JJ : wn.ADJ, RB: wn.ADV}
_wordnet2pattern = {v : k for k, v in _pattern2wordnet.items()}
_wordnet2pattern[wn.ADJ_SAT] = JJ


def synsets(word, pos=NOUN):
    """ Returns a list of Synset objects, one for each word sense.
"""
import sys
sys.path.append('..')
from nprcommontools import alpha_only, sort_string, get_synonyms
from nltk.corpus import wordnet as wn, brown
from collections import defaultdict

def is_adjective(word):
    ''' Check if the word is an adjective, per Wordnet '''
    syns = wn.synsets(word)
    for s in syns:
        if s.pos() in ('a','s'):
            return True
    return False

words5 = frozenset(x for x in brown.words() if len(x) == 5 and x.isalpha() and x.startswith('s') and is_adjective(x))
words4 = frozenset(x for x in brown.words() if len(x) == 4 and x.isalpha() and 's' in x and is_adjective(x))
words9 = dict((sort_string(alpha_only(x)),x) for x in wn.all_lemma_names() if len(alpha_only(x)) == 9)
#%%
for word in sorted(words5):
    syns = get_synonyms(word)
    for syn in syns.intersection(words4):
        s1 = sort_string('a' + word[1:] + syn)
        #if s1:
        #    print word, syn
        try:
            new_word = words9[s1]
            print(word, syn, new_word)
        except KeyError:
            pass