def fun5():
    """Demonstrate NLTK's Swadesh comparative wordlists.

    Another example of a tabular lexicon is the comparative wordlist.
    NLTK includes the so-called Swadesh wordlists: lists of about 200
    common words in several languages, keyed by ISO-639 two-letter
    language codes. ``entries()`` pairs cognate words across languages,
    and those pairs convert directly into a simple translation dict.
    """
    from nltk.corpus import swadesh
    print(swadesh.fileids())       # available language identifiers
    print(swadesh.words('en'))     # the English wordlist
    # Access cognates in multiple languages via entries(); the
    # (french, english) pairs double as a translation dictionary.
    fr2en = swadesh.entries(['fr', 'en'])
    print(fr2en)
    translate = dict(fr2en)
    print(translate['chien'])
    print(translate['jeter'])
def _calculate_languages_ratios(self, text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language want to be detected
    @type text: str

    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    # Build the lower-cased token set once: it is loop-invariant, so
    # hoisting it avoids reconstructing the set for every language.
    words_set = {word.lower() for word in tokens}
    # Score each language by how many of its Swadesh words occur in the text.
    for language in swadesh.fileids():
        stopwords_set = set(swadesh.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    return languages_ratios
def compareWordlist():
    """Build a multilingual translation dict from NLTK's Swadesh wordlists
    and print a few aligned entries across seven languages.
    """
    # French-English cognate pairs form the base translation dictionary.
    translate = dict(swadesh.entries(['fr', 'en']))
    # Merge in German-English and Spanish-English cognates.
    translate.update(dict(swadesh.entries(['de', 'en'])))  # German-English
    translate.update(dict(swadesh.entries(['es', 'en'])))  # Spanish-English
    languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
    # Hoisted out of the loop: entries() re-reads the corpus on every call.
    aligned = swadesh.entries(languages)
    for i in [139, 140, 141, 142]:
        print(aligned[i])
def test_swadesh(model, lang) -> Tuple[Optional[float], Optional[List]]:
    """Evaluate *model* against the Swadesh wordlist for *lang*.

    Returns an (accuracy, errors) pair; both elements are None when no
    Swadesh corpus exists for the requested language.
    """
    # Guard clause: bail out early when the language is not covered.
    if lang not in set(swadesh.fileids()):
        logging.error('No Swadesh corpus for "{}"'.format(lang))
        return None, None
    logging.info('Testing model on Swadesh list for {}...'.format(lang))
    # Some Swadesh entries carry contextual definitions after the head
    # word, so keep only the first whitespace-separated token of each.
    wordlist = [entry.split()[0].casefold() for entry in swadesh.words(fileids=lang)]
    return test_accuracy(wordlist, model)
# An example of a tabular lexicon is the comparative wordlist. NLTK includes
# so-called Swadesh wordlists: lists of about 200 common words in several
# languages. The Swadesh list is used in the quantitative assessment of the
# genealogical relatedness of languages.
from nltk.corpus import swadesh

print(swadesh.fileids())    # language identifiers (two-letter codes)
print()                     # blank separator line
print(swadesh.words("de"))  # 200 common German words from the Swadesh list
# NOTE(review): collapsed REPL-style fragment. The first statement,
# "for name in names.words(fileid))", has an unbalanced ")" and is the
# tail of an expression that starts outside this view — the line is not
# runnable as-is and is preserved byte-for-byte below. It mixes CMU
# pronouncing-dictionary lookups with a Swadesh wordlist walkthrough.
for name in names.words(fileid)) cfd.plot() entries = nltk.corpus.cmudict.entries() len(entries) for entry in entries[42371:42379]: print(entry) syllable = ['N', 'IHO', 'K', 'S'] [word for word, pron in entries if pron[-4:] == syllable] [w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n'] from nltk.corpus import swadesh swadesh.fileids() swadesh.words('en') fr2en = swadesh.entries(['fr', 'en']) fr2en translate = dict(fr2en) translate['chien'] translate['jeter'] de2en = swadesh.entries(['de', 'en']) es2en = swadesh.entries(['es', 'en']) translate.update(dict(de2en)) translate.update(dict(es2en)) translate['Hund'] translate['perro']
# Look up word pronunciations in the CMU pronouncing dictionary.
text = ['natural', 'language', 'processing']
# The [0] picks the first pronunciation, because 'natural' has two.
pron_list = [ph for w in text for ph in prondict[w][0]]
print("word pronoun list= ", pron_list)
pron_list = [ph for w in text for ph in prondict[w]]
print("'natural' pronoun list= ", pron_list)
print("prondict['natural']=", prondict['natural'])

# P70 2.4.3 Comparative wordlists (Swadesh wordlists): lists of about 200
# common words in several languages, useful both for comparing languages
# and for translating words between them.
from nltk.corpus import swadesh

print("swadesh.fileids()= ", swadesh.fileids())
print("swadesh.words('en')= ", swadesh.words('en'))
fr2en = swadesh.entries(['fr', 'en'])
print("fr2en= ", fr2en[:13])
translate = dict(fr2en)
print("translate= ", translate)
print("translate['chien']= ", translate['chien'])
# Fold German-English and Spanish-English cognates into the same dict.
translate.update(dict(swadesh.entries(['de', 'en'])))
translate.update(dict(swadesh.entries(['es', 'en'])))
print("translate= ", translate)
print("translate['jeter']= ", translate['jeter'])
# NOTE(review): collapsed mix of code and a ">>>" REPL transcript whose
# pasted outputs (FreqDist repr, generated text, fileid list) sit inline
# with the statements — not valid Python as-is. The opening
# "word = cfdist[word].max()" is the tail of a function cut off before
# this view. Preserved byte-for-byte below.
word = cfdist[word].max() text = nltk.corpus.genesis.words('english-kjv.txt') bigrams = nltk.bigrams(text) cfd = nltk.ConditionalFreqDist(bigrams) >>> cfd['living'] FreqDist({'creature': 7, 'thing': 4, 'substance': 2, ',': 1, '.': 1, 'soul': 1}) >>> generate_model(cfd, 'living') living creature that he said , and the land of the land of the land # basically, we get a conditional frequency distribution of bigrams so we can see which word pairs occur frequently # then the function will reset the context word (starting with seed word) every time = predictive text # symbols in the CMU Pronouncing Dictionary are from the Arpabet # Swadesh wordlists: lists of about 200 common words in several languages. Access like below: from nltk.corpus import swadesh >>> swadesh.fileids() ['be', 'bg', 'bs', 'ca', 'cs', 'cu', 'de', 'en', 'es', 'fr', 'hr', 'it', 'la', 'mk', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sw', 'uk'] # can create a simple dictionary for these words >>> fr2en = swadesh.entries(['fr', 'en']) >>> fr2en [('je', 'I'), ('tu, vous', 'you (singular), thou'), ('il', 'he'), ...] >>> translate = dict(fr2en) >>> translate['chien'] 'dog' >>> translate['jeter'] 'throw' >>> de2en = swadesh.entries(['de', 'en']) # German-English >>> es2en = swadesh.entries(['es', 'en']) # Spanish-English >>> translate.update(dict(de2en)) # this is basically just adding onto the previous translate dictionary with french in it >>> translate.update(dict(es2en))
__author__ = 'lizhifeng'

from nltk.corpus import swadesh

# Show the available language codes and the English Swadesh wordlist.
print(swadesh.fileids())
print(swadesh.words('en'))
# French-English cognate pairs double as a translation dictionary.
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)
translate = dict(fr2en)
print(translate["chien"])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Swadesh wordlist walkthrough (REPL-style; expression values are discarded)."""
import nltk
from nltk.corpus import swadesh

# Evaluate-and-discard statements mirror an interactive session.
swadesh.fileids()
swadesh.words("en")
pairs = swadesh.entries(["fr", "en"])
pairs
# The (french, english) pairs convert into a plain lookup dict.
translate = dict(pairs)
translate["chien"]
translate["jeter"]
import nltk
# Using corpus swadesh
from nltk.corpus import swadesh

# Headings rendered with the help of streamlit.
# NOTE(review): "arrrow" typo in the user-facing text below — left
# untouched here; fix alongside a UI copy review.
st.title('Translate English words to various languages')
st.markdown('If using mobile please tap on top left arrrow button to visualize sidebar')
st.sidebar.title('Translate English words to various languages')
st.markdown('This is a dashboard where you can hover over the country in a map to know the language codes of that place\'s native language and then translate English words')

# Languages whose words are present in the swadesh corpus.
languages = swadesh.fileids()
# iso_alpha codes of countries, used as locations in the plotly_express map;
# positionally aligned with `languages` and `country_names`.
country_codes = ['BLR', 'BGR', 'BIH', 'AND', 'CZE', 'MNE', 'DEU', 'USA', 'ESP', 'FRA',
                 'HRV', 'CHE', 'ITA', 'MKD', 'ABW', 'POL', 'PRT', 'ROU', 'RUS', 'SVK',
                 'SVN', 'SRB', 'KEN', 'UKR']
country_names = ['Belarus', 'Bulgaria', 'Bosnia and Herzengove', 'Andorra', 'Czech Republic',
                 'Montenegro', 'Germany', 'USA', 'Spain', 'France', 'Croatia', 'Switzerland',
                 'Italy', 'Republic of North Macedonia', 'Aruba', 'Poland', 'Portugal',
                 'Romania', 'Russia', 'Slovakia', 'Slovenia', 'Serbia', 'Kenya', 'Ukraine']

# zip the parallel lists instead of indexing by position — clearer and
# avoids the manual range(len(...)) idiom. Assumes (as the original did)
# that the three lists stay the same length and order.
lang2country_name = dict(zip(languages, country_names))
lang2country = dict(zip(languages, country_codes))

# Country_codes converted to pandas dataframe so as to use them in maps
from nltk.corpus import swadesh

# Available language codes and the English Swadesh list.
print(swadesh.fileids(), '\n')
print(swadesh.words('en'), '\n')

# French-English cognate pairs, usable as a translation dictionary.
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en, '\n')
translate = dict(fr2en)
print(translate['chien'])
print(translate['jeter'], '\n')

# Extend the dictionary with German and Spanish sources.
de2en = swadesh.entries(['de', 'en'])  # German-English
es2en = swadesh.entries(['es', 'en'])  # Spanish-English
translate.update(dict(de2en))
translate.update(dict(es2en))
print(translate['Hund'])
print(translate['perro'], '\n')

# Print a few aligned rows across seven languages.
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
# Hoisted out of the loop: entries() re-reads the corpus on every call.
aligned = swadesh.entries(languages)
for i in [139, 140, 141, 142]:
    print(aligned[i])