def process():
    # Install Open Multilingual Wordnet if not already installed.
    nltkd = nltk.downloader.Downloader()
    if not nltkd.is_installed('omw'):
        nltk.download('omw')

    # Figure out ISO 639-2 code for the specified locale. Exit if unavailable.
    print(args.language)
    iso639_2 = langcodes.best_match(args.language, wn.langs())[0]
    print(iso639_2)
    print(wn.langs())
    if iso639_2 == 'und':
        # Nearest ISO 639-2 code is undefined.
        exit("Requested language is not available on this NLTK Wordnet installation.")

    # Obtain set of lowercased lemmas that belong to only one part of speech.
    posdb = dict()
    single_pos_lemmas = set()
    for pos in ['a', 'r', 'n', 'v']:
        posdb[pos] = set()
        # Note: wn.all_lemma_names() returns the lemma names in all lowercase.
        # To remove lemmas that are sometimes or always capitalised in normal
        # writing (e.g. "China" or "Arulo"), we will need to obtain capitalised
        # lemmas from Wordnet later on, and remove members of our output set
        # that are identical to the lowercased transformation of those
        # capitalised lemmas.
        for lemma in wn.all_lemma_names(pos=pos, lang=iso639_2):
            posdb[pos].add(lemma)
        single_pos_lemmas.symmetric_difference_update(posdb[pos])

    # Remove lemmas containing characters other than a-z.
    output_set = set()
    for term in single_pos_lemmas:
        if non_word.search(term) is not None:
            continue
        output_set.add(term)

    # Obtain a set of lemmas that are typically capitalised in normal writing,
    # then drop their lowercased forms from the output set.
    unlowered_lemmas = set()
    for synset in wn.all_synsets():
        for lemma in synset.lemma_names():
            unlowered_lemmas.add(lemma)
    for lemma in unlowered_lemmas:
        if not lemma.islower():
            output_set.discard(lemma.lower())

    # Keep only terms of five or six letters.
    # Filter inspiration: http://stackoverflow.com/a/16562558
    output_set = {x for x in output_set if 4 < len(x) < 7}

    # Remove terms that are also (lowercased) proper names.
    names_lowered = set()
    for name in nltk.corpus.names.words():
        names_lowered.add(name.lower())
    output_set = {x for x in output_set if x not in names_lowered}

    print(output_set)
    # print(single_pos_lemmas)
    print(len(single_pos_lemmas))
    print(len(output_set))
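# Scaffolding sketch for process() above. The snippet references
# module-level names (args, non_word, langcodes, wn) that it does not
# define; the following is a plausible, assumed setup, not the original's.
import re
import argparse

import nltk
import langcodes
from nltk.corpus import wordnet as wn

# Assumed CLI flag supplying the locale that process() reads from args.
parser = argparse.ArgumentParser()
parser.add_argument('--language', default='en')
args = parser.parse_args()

# Assumed pattern matching any character outside a-z, used above to drop
# lemmas that contain non-word characters.
non_word = re.compile(r'[^a-z]')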
def assert_lang_supported_by_wordnet(lang):
    lang = iso_639_alpha3(lang)  # wordnet needs iso-639-2
    if lang in wn.langs():
        return True
    raise ValueError(
        f"Language '{lang}' not implemented in WordNet. "
        f"Implemented languages are: {sorted(wn.langs())}"
    )
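# Usage sketch for assert_lang_supported_by_wordnet above. The original
# snippet does not show where iso_639_alpha3 comes from, so a hypothetical
# stand-in is stubbed here.
from nltk.corpus import wordnet as wn

def iso_639_alpha3(lang):
    # Hypothetical stand-in for the project's real helper: normalise a
    # language code to the alpha-3 form that the WordNet corpus uses.
    return {'en': 'eng', 'ja': 'jpn', 'th': 'tha'}.get(lang, lang)

assert_lang_supported_by_wordnet('en')   # returns True
try:
    assert_lang_supported_by_wordnet('xx')
except ValueError as err:
    print(err)  # lists the implemented language codes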
def __init__(self, folder, lang):
    self.folder = folder
    if lang in wn.langs():
        self.lang = lang
    else:
        print("language: '%s' is not supported, try another language" % lang)
    # initialize
    self.WordCounter = {}
    self.SynsetCounter = {}
    self.pos_list = ['a', 's', 'r', 'n', 'v']
def __init__(self, file_name, folder, lang):
    self.file_name = file_name
    self.folder = folder
    if lang in wn.langs():
        self.lang = lang
    else:
        print("language: '%s' is not supported, try another language" % lang)
    # initialize
    self.WordIndex = {}
    self.SynsetIndex = {}
    self.pos_list = ['a', 's', 'r', 'n', 'v']
    self.Shared = util.Shared()
def __init__(self, file_name, folder, lang):
    self.file_name = file_name  # where the word2vec model exists
    self.folder = folder
    if lang in wn.langs():
        self.lang = lang
    else:
        print("language: '%s' is not supported, try another language" % lang)
    # initialize
    self.WordGenerality = {}
    self.pos_list = ['a', 's', 'r', 'n', 'v']
    self.model = word2vec.Word2Vec.load(self.file_name)
def all_langs(self, part1_format=False):
    """
    $ python -m sagas.nlu.wordnet_procs all_langs
    :return:
    """
    from sagas.nlu.locales import iso_locales
    langs = wn.langs()
    if part1_format:
        print(', '.join(sorted(iso_locales.iso_map.keys())))
    else:
        print('total', len(langs))
        print(', '.join(sorted(langs)))
        print(', '.join(sorted(iso_locales.iso_map.keys())))
def __init__(self, file_name, folder, lang):
    self.file_name = file_name
    self.folder = folder
    if lang in wn.langs():
        self.lang = lang
    else:
        print("language: '%s' is not supported, try another language" % lang)
    # initialize
    self.WordIndex = {}
    self.SynsetIndex = {}
    self.pos_list = ['a', 's', 'r', 'n', 'v']
    self.pointer_map = {"@": "hypernym", "&": "similar", "$": "verbGroup", "!": "antonym"}
    self.Shared = util.Shared()
def langs():
    """
    This function returns a list of ISO-639 language codes.

    :return: ISO-639 language codes
    :rtype: list[str]

    :Example:
    >>> from pythainlp.corpus.wordnet import langs
    >>> langs()
    ['eng', 'als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eus',
     'fas', 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'ita', 'jpn',
     'nld', 'nno', 'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe',
     'tha', 'zsm']
    """
    return wordnet.langs()
def nltk_locales(self):
    """
    $ python -m sagas.nlu.locales nltk_locales
    :return:
    """
    from nltk.corpus import wordnet as wn
    from iso639 import languages
    import sagas

    langs = wn.langs()
    print(len(langs), sorted(langs))
    rs = []
    excepts = ['qcn']
    for lang in langs:
        if lang not in excepts:
            loc = languages.get(part3=lang)
            rs.append((loc.part3, loc.macro, loc.name))
    df = sagas.to_df(rs, ['code', 'macro', 'name'])
    sagas.print_df(df)
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize

multi_lang = sorted(wn.langs())
# import _lang_sel.py

usr_in = input('enter one word here: ')
tokenized = sent_tokenize(usr_in)
for i in tokenized[:2]:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    print("NLTK Tag = ", tagged)
    tagged = str(tagged)
    char1 = ","
    char2 = ")"
    tag_str = tagged[tagged.find(char1) + 3:tagged.find(char2) - 1]
    print("pos_tag = ", tag_str)

"""
char1 = '('
char2 = ')'
mystr = "mystring(123234sample)"
print(mystr[mystr.find(char1)+1 : mystr.find(char2)])
123234sample
"""

syns = wordnet.synsets(usr_in)
import nltk
from tabulate import tabulate

# Install Open Multilingual Wordnet and Wordnet if not already installed.
nltkd = nltk.downloader.Downloader()
for corpus in ['wordnet', 'omw']:
    if not nltkd.is_installed(corpus):
        nltk.download(corpus)

from nltk.corpus import wordnet as wn

table = list()
for lang in sorted(wn.langs()):
    my_set_of_all_lemma_names = set()
    for aln_term in list(wn.all_lemma_names(lang=lang)):
        for synset in wn.synsets(aln_term):
            for lemma in synset.lemma_names():
                my_set_of_all_lemma_names.add(lemma)
    table.append([lang,
                  len(set(wn.all_lemma_names(lang=lang))),
                  len(my_set_of_all_lemma_names)])

print(tabulate(table,
               headers=["Language code", "all_lemma_names()",
                        "lemma_name.synset.lemma.lemma_names()"]))
def langs():
    return wordnet.langs()
for hyponym2 in hyponyms:
    if hyponym2 != hyponym:
        newlist = []
        newlist.append(hyponym.name())
        newlist.append(hyponym2.name())
        pair = str(sorted(newlist)[0]) + ',' + str(sorted(newlist)[1])
        if pair not in lemdict:
            lemdict[pair] = 0

for key, value in lemdict.items():
    lemm = key.split(',')
    lemm1 = wn.synset(lemm[0])
    lemm2 = wn.synset(lemm[1])
    common = []
    for lang in sorted(wn.langs()):
        lang1 = lemm1.lemma_names(lang)
        lang2 = lemm2.lemma_names(lang)
        for a in lang1:
            for b in lang2:
                if a == b:
                    common.append(a)
    lemdict[key] += len(common)

G = nx.Graph()
for hyponym in hyponyms:
    G.add_node(hyponym.name())
for key, value in lemdict.items():
    word = key.split(',')
    w1 = word[0]
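# Context sketch for the loops above, which start mid-script: hyponyms,
# lemdict, and the loop variable hyponym are defined earlier. A plausible
# preamble, assuming the hyponyms of one root synset ('dog.n.01' is an
# arbitrary illustrative choice):
import networkx as nx
from nltk.corpus import wordnet as wn

hyponyms = wn.synset('dog.n.01').hyponyms()  # assumed root synset
lemdict = {}
for hyponym in hyponyms:
    pass  # the pair-building `for hyponym2 in hyponyms:` loop nests here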
def langs(self):
    return wn.langs()
# Get all data for a word in the language defined by the `language` variable.
from nltk.corpus import wordnet as wn
import pprint
from itertools import chain

# get all languages
sorted(wn.langs())

user_input = input("Enter word to get hyponyms and hypernyms: ")
language = 'spa'
lista = []
words = wn.synsets(user_input, lang=language)
pp = pprint.PrettyPrinter(indent=4)
for word in words:
    lista.append({
        'id': word.name(),
        'word': word.lemma_names(language),
        'meaning': word.definition(),
        'hypernyms': [],
        'hyponyms': []
    })
    currentHyponyms = []
    currentHypernyms = []
    for hyponym in word.hyponyms():
        currentHyponyms.append(hyponym.lemma_names(language))
    flat_list = [item for sublist in currentHyponyms for item in sublist]
for synset in a:
    print(synset.lemma_names())

"""
female_names = names.words('female.txt')
#print(female_names)
cfd = nltk.ConditionalFreqDist((fileid, name[-1])
                               for fileid in names.fileids()
                               for name in names.words(fileid))
#cfd.plot()
"""

print(wordNet.synsets('motorcar'))
print(wordNet.synset('car.n.01').lemma_names())
print(wordNet.synset('car.n.01').definition())

print("\nIDIOMAS: " + str(wordNet.langs()) + "\n")
print(wordNet.synsets('jugar', lang='spa'))
vero = wordNet.synsets('jugar', lang='spa')
vero = wordNet.synsets('computer')
print(vero)

pc = wordNet.synsets('automobile')
print("PC-> " + str(pc))
tiposComputadora = wordNet.synset('automobile.n.01')
print("hyponyms-> " + str(tiposComputadora.hyponyms()))
print("Hypernyms-> " + str(tiposComputadora.hypernyms()))
from textblob.wordnet import Synset
import nltk
from nltk.corpus import wordnet as wn

# word = Word("pen")
# print(word.synsets)
# define = word.definitions
# print(define)
#
# octopus = Synset('octopus.n.02')
# shrimp = Synset('shrimp.n.03')
# simi = octopus.path_similarity(shrimp)
# print(simi)
#
# wiki = TextBlob("Python is a high-level, general-purpose programming language.")
# print(wiki.tags)
# print(wiki.noun_phrases)

print(wn.synsets('pen'))
print(wn.synset('penitentiary.n.01'))
print(wn.synset('penitentiary.n.01').definition())
print(wn.synset('dog.n.01').lemmas())
print(sorted(wn.langs()))
print(wn.synset('dog.n.01').lemma_names('ind'))

dog = wn.synset('dog.n.01')
print(dog.root_hypernyms())
print(wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')))
def __init__(self, language="eng"):
    if language not in wordnet.langs():
        raise ValueError(f"Language {language} not one of {wordnet.langs()}")
    self.language = language
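# Usage sketch for the constructor above; the class name WordnetTool is
# hypothetical, since the snippet shows only __init__.
from nltk.corpus import wordnet

class WordnetTool:
    def __init__(self, language="eng"):
        if language not in wordnet.langs():
            raise ValueError(f"Language {language} not one of {wordnet.langs()}")
        self.language = language

WordnetTool("jpn")   # accepted: 'jpn' is in wordnet.langs()
# WordnetTool("xx")  # raises ValueError listing the valid codes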
print("res_similarity: ", dog.res_similarity(cat, brown_ic)) print("res_similarity: ", dog.res_similarity(puppy, brown_ic)) # jcn_similarity print("jcn_similarity: ", dog.jcn_similarity(cat, brown_ic)) print("jcn_similarity: ", dog.jcn_similarity(puppy, brown_ic)) # lin_similarity print("lin_similarity: ", dog.lin_similarity(cat, brown_ic)) print("lin_similarity: ", dog.lin_similarity(puppy, brown_ic)) # Extend Open Multilingual Wordnet # 测试Extend Open Multilingual Wordnet的使用 # 打印所有支持的语言,其中cmn表示中文 print(wn.langs()) cmn = wn.lemmas(u"选择", lang='cmn') print("cmn: ", cmn) name = wn.lemma('choose.v.01.选择', lang='cmn').name() print("name: ", name) # “选择”这个词的所有同义词集 synsets = wn.synsets(u"选择", lang='cmn') print("synsets: ", synsets) name = wn.synsets(u"选择", lang='cmn')[0].lemmas()[0].name() print("name: ", name) # 一个同义词集的中文同义词集。 cmn_synset = wn.synset("choose.v.01").lemma_names('cmn') print("cmn_synset: ", cmn_synset)
with open(path, mode='w+', encoding='utf-8') as f:
    for key, synsets in lemma2fun.items():
        print('\t'.join(list(key) + synsets), file=f)


def read_possibility_dictionary(path):
    from ast import literal_eval
    lemma_cat2fun = dict()
    print(path)
    with open(path, mode='r', encoding='utf-8') as f:
        for l in f:
            val, funs = literal_eval(l)
            lemma_cat2fun[val] = funs
    return lemma_cat2fun


if __name__ == '__main__':
    #lang2lemma2fun, _ = generate_possibility_dictionary(wn.langs())
    #print(type(lang2lemma2fun))
    lang2lemmacat2fun = generate_possibility_dictionary(wn.langs(), usecat=True)
    print(type(lang2lemmacat2fun))
    print('Created dict')
    #for lang in wn.langs():
    #    write_possibility_dictionary('../data/possibility_dictionaries/wn2/{}.txt'.format(lang), lang2lemma2fun[lang])
    for lang in wn.langs():
        write_possibility_dictionary(
            '../data/possibility_dictionaries/wn2/{}.txt'.format(lang),
            lang2lemmacat2fun[lang])
    print('Printed dict')
    print('Done.')
def print_synset(synset, langs):
    name = synset.name()
    examples = '|'.join(synset.examples())
    d = synset.definition()
    print('\t'.join([
        name,
        '\t'.join([synset_to_str(synset, l) for l in langs]),
        examples,
        d
    ]))


# http://www.loc.gov/standards/iso639-2/php/code_list.php
# two languages not found: zsm, qcn
langs2 = [
    'en', 'sq', 'ar', 'bg', 'ca', 'zh', 'da', 'el', 'eu', 'fa', 'fi',
    'fr', 'gl', 'he', 'hr', 'id', 'it', 'jp', 'nl', 'nn', 'nb', 'pl',
    'pt', 'qc', 'sl', 'es', 'sv', 'th', 'ms'
]
langs3 = [
    'eng', 'als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eus',
    'fas', 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'ita', 'jpn',
    'nld', 'nno', 'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe',
    'tha', 'zsm'
]

langs = wn.langs()
print('\t'.join(['synset', '\t'.join(langs2), 'examples', 'definitions']))
for s in wn.all_synsets():
    print_synset(s, langs)
for synset in list(wn.all_synsets('n'))[:10]:
    print(synset)

print(wn.synsets('dog', pos=wn.VERB))
print("*" * 111)

print(wn.synset('dog.n.01'))
print(wn.synset('dog.n.01').definition())
print(len(wn.synset('dog.n.01').examples()))
print(wn.synset('dog.n.01').examples()[0])
print(wn.synset('dog.n.01').lemmas())
a = [str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()]
print(a)
print(wn.lemma('dog.n.01.dog').synset())
print("*" * 111)

print(sorted(wn.langs()))
print(wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn'))  # 犬 ("dog")
print(wn.synset('spy.n.01').lemma_names('jpn'))
print(wn.synset('dog.n.01').lemma_names('ita'))
print("*" * 111)

dog = wn.synset('dog.n.01')
print(dog.hypernyms())
print(dog.hyponyms())
print(dog.member_holonyms())
print(dog.root_hypernyms())
print(wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')))
print("*" * 111)

good = wn.synset('good.a.01')
# print(good.antonyms())
print(stopwords.words('english'))
print(not_stopwords(nltk.corpus.reuters.words()))
print(not_stopwords(nltk.corpus.inaugural.words()))

print(wordnet.synsets('cat'))
print(wordnet.synsets('cat', pos=wordnet.VERB))
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')
print(cat.definition())
print(len(cat.examples()))
print(cat.lemmas())
print([str(lemma.name()) for lemma in cat.lemmas()])
print(wordnet.lemma('cat.n.01.cat').synset())

print(sorted(wordnet.langs()))
print(cat.lemma_names('ita'))
print(sorted(cat.lemmas('dan')))
print(sorted(cat.lemmas('por')))
print(len(wordnet.all_lemma_names(pos='n', lang='jpn')))

print(cat.hypernyms())
print(cat.hyponyms())
print(cat.member_holonyms())
print(cat.root_hypernyms())
print(cat.lowest_common_hypernyms(dog))

# NLP_w_Python Ch 1
print('=========================================')
print('Looking up a Synset for a Word in WordNet')
print('=========================================')