def compareWordlist():
    """Walk through the Swadesh comparative-wordlist examples.

    Builds a French -> English lookup table from the Swadesh corpus, extends
    it with the German -> English and Spanish -> English pairs, and finally
    prints the aligned entries at indices 139-142 for the language codes
    en, de, nl, es, fr, pt and la.

    Relies on a module-level ``swadesh`` corpus reader being in scope.
    Several expressions are evaluated interpreter-session style: their
    results are discarded, but a missing key still raises ``KeyError``.

    Returns:
        None. The four selected rows are written to standard output.
    """
    swadesh.fileids()
    swadesh.words('en')

    fr2en = swadesh.entries(['fr', 'en'])
    translate = dict(fr2en)
    translate['chien']
    translate['jeter']

    de2en = swadesh.entries(['de', 'en'])  # German-English
    es2en = swadesh.entries(['es', 'en'])  # Spanish-English
    translate.update(dict(de2en))
    translate.update(dict(es2en))
    translate['Hund']
    translate['perro']

    languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
    # Fetch the aligned rows once instead of on every loop iteration.
    aligned_rows = swadesh.entries(languages)
    for i in [139, 140, 141, 142]:
        # print() with a single argument behaves the same on Python 2 and 3.
        print(aligned_rows[i])
# Swadesh core-vocabulary wordlists
from nltk.corpus import swadesh

available_languages = swadesh.fileids()
print(available_languages)

english_words = swadesh.words('en')
print(english_words)

# Passing a list of languages to entries() gives access to the cognate
# words across those languages.
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)
# In[11]:

from nltk.corpus import swadesh

# available languages
print(swadesh.fileids())

# In[12]:

print(swadesh.words('en'))

# We translate from **French** to **Spanish**

# In[13]:

fr2es = swadesh.entries(['fr', 'es'])
print(fr2es)

# In[14]:

# French word -> Spanish word lookup table.
translate = {french: spanish for french, spanish in fr2es}
translate['chien']

# In[15]:

translate['jeter']

# # WordNet

# ## References

#
# Here you see a single dictionary covering one language pair in both
# directions: German to English and vice versa.
from nltk.corpus import swadesh

de_to_en = swadesh.entries(["de", "en"])
en_to_de = swadesh.entries(["en", "de"])

translate = {german: english for german, english in de_to_en}
translate1 = {english: german for english, german in en_to_de}
# Fold the English -> German pairs into the German -> English table.
translate.update(translate1)

for query in ("Hund", "dog"):
    print(translate[query])
from nltk.corpus import swadesh
from tkinter import *

en2de = swadesh.entries(['en', 'de'])  # English-German
translate = dict(en2de)


def translate_word():
    """Look up the word typed into the entry box and show the result.

    Reads the text bound to ``english_word``, looks it up in the module-level
    ``translate`` table and appends the German entry to ``list1``.

    The word is tried exactly as typed first and lower-cased second, so keys
    that contain capital letters stay reachable. Empty input is ignored, and
    a word missing from the table produces a message in the list box instead
    of a ``KeyError`` raised inside the Tk callback.
    """
    typed = english_word.get().strip()
    if not typed:
        return

    german_words = translate.get(typed)
    if german_words is None:
        german_words = translate.get(typed.lower())

    if german_words is None:
        list1.insert(END, "No translation found for: " + typed)
    else:
        list1.insert(END, german_words)


window = Tk()
window.wm_title("Translater")

# Labels for the English word and the German word
l1 = Label(window, text="English Word")
l1.grid(row=1, column=0)

l2 = Label(window, text="German Word")
l2.grid(row=1, column=2)

# Entry text
english_word = StringVar()
e1 = Entry(window, textvariable=english_word)
e1.grid(row=1, column=1)

# List box that receives the looked-up German words
list1 = Listbox(window, height=6, width=15)
list1.grid(row=2, column=3, columnspan=3)
for target in ['men', 'women', 'people'] if w.lower() == target) cfd.tabulate() # In[16]: cfd.plot() # #### 6. In the discussion of comparative wordlists, we created an object called translate which you could look up using words in both German and Italian in order to get corresponding words in English. What problem might arise with this approach? Can you suggest a way to avoid this problem? # In[17]: from nltk.corpus import swadesh de2en = swadesh.entries(['de', 'en']) it2en = swadesh.entries(['it', 'en']) translate2 = dict(de2en) translate2.update(dict(it2en)) len(translate2) # In[18]: translate2['bianco'] # In[19]: translate2['Hund']
import nltk
from nltk.corpus import stopwords

stopwords.words('english')


def content_fraction(text):
    """Return the fraction of tokens in *text* that are not English stopwords.

    Args:
        text: a sized, iterable collection of string tokens.

    Returns:
        A float between 0.0 and 1.0; 0.0 when *text* is empty.
    """
    total = len(text)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    # A set gives constant-time membership tests, and the distinct local name
    # no longer shadows the imported ``stopwords`` corpus reader.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    content = [w for w in text if w.lower() not in stopword_set]
    # float() keeps this a true division on Python 2 as well.
    return float(len(content)) / total


content_fraction(nltk.corpus.reuters.words())
content_fraction(nltk.corpus.inaugural.words())

# Translator
from nltk.corpus import swadesh

languages = ['en', 'ro', 'es', 'fr', 'pt', 'la']
# Fetch the aligned rows once instead of on every loop iteration.
aligned_rows = swadesh.entries(languages)
for i in [100, 141, 143]:
    print(aligned_rows[i])

# Wordnet
# dictionary of English
from nltk.corpus import wordnet as wn

wn.synsets('motorcar')
# In NLTK 3 lemma_names and definition are methods, not attributes.
wn.synset('car.n.01').lemma_names()
wn.synset('car.n.01').definition()

for synset in wn.synsets('car')[1:3]:
    print(synset.lemma_names())

# Depth of a synset
wn.synset('whale.n.02').min_depth()
wn.synset('vertebrate.n.01').min_depth()

wn.synset('walk.v.01').entailments()  # Walking involves stepping
# The CMU dictionary is also available as a plain Python dictionary
# rather than as a list of tuples.
prondict = nltk.corpus.cmudict.dict()
prondict['fire']

# 4.3 Comparative wordlist
#
# The swadesh comparative word list is a list of 200 common words in
# multiple languages.
from nltk.corpus import swadesh

swadesh.fileids()
swadesh.words('en')

# Build a translator out of the word list
fr2en = swadesh.entries(['fr', 'en'])
translate = {source: english for source, english in fr2en}
translate['chien']
translate['jeter']

# Extra languages can be added by updating the dictionary; German and
# Spanish are added here.
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
for extra_pairs in (de2en, es2en):
    translate.update(extra_pairs)

# Spanish for dog
translate['perro']

# German for dog
translate['Hund']
# Look up the pronunciation of words in the pronouncing dictionary
text = ['natural', 'language', 'processing']
# [0] is needed because 'natural' has two pronunciations; one is enough.
[ph for w in text for ph in prondict[w][0]]
[ph for w in text for ph in prondict[w]]
prondict['natural']

# p. 70, section 4.3: comparative wordlists (Swadesh wordlists). They hold
# about 200 common words in several languages and can be used to compare
# two languages or to translate words between languages.
from nltk.corpus import swadesh

swadesh.fileids()
swadesh.words('en')

fr2en = swadesh.entries(['fr', 'en'])
fr2en
translate = dict(fr2en)
translate['chien']

de2en = swadesh.entries(['de', 'en'])
translate.update(dict(de2en))
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(es2en))

translate['jeter']
translate['Hund']
translate['perro']

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
# Fetch the aligned rows once instead of on every loop iteration.
aligned_rows = swadesh.entries(languages)
for i in [139, 140, 141, 142]:
    print(aligned_rows[i])
tree.member_meronyms()      # members contained in tree -- meronyms
tree.part_meronyms()        # sub-structures -- meronyms
tree.substance_meronyms()   # substances -- meronyms
tree.member_holonyms()      # what contains tree -- holonyms; here usually the forest
tree.part_holonyms()        # sub-structures -- holonyms; certainly empty here
tree.substance_holonyms()   # substances -- holonyms; usually empty too, as these sets do not intersect

""" 6. ○在比较词表的讨论中, 我们创建了一个对象叫做translate, 通过它你可以使用德语 和意大利语词汇查找对应的英语词汇。这种方法可能会出现什么问题?你能提出一个办 法来避免这个问题吗? 如何知道输入的语言是德语还是意大利语呢,特别是在意大利语与德语词汇相同当语义不同的时候 """

# Exercise 6
from nltk.corpus import swadesh

# The exercise and the answer below are about German and Italian, so the
# table is seeded with the German -> English pairs (previously French).
de2en = swadesh.entries(['de', 'en'])
translate = dict(de2en)
# dict.update accepts the (Italian, English) pairs directly.
translate.update(swadesh.entries(['it', 'en']))

# Possible problem: how do we know whether the input word is German or
# Italian, especially when a word is spelled the same in both languages but
# has a different meaning?

""" 7. ○根据Strunk和 White 的《 Elements of Style》, 词 however在句子开头使用是“ in wh atever way” 或“ to whatever extent” 的意思,而没有“ nevertheless” 的意思。他们给 出了正确用法的例子: However you advise him, he will probably do as he thinks bes t.( http://www.bartleby.com/141/strunk3.html)。使用词汇索引工具在我们一直在思考的 各种文本中研究这个词的实际用法。也可以看 LanguageLog发布在 http://itre.cis.upenn. edu/~myl/languagelog/archives/001913.html上的 “ Fossilized prejudices about ‘ however’”。 """

""" 8. ◑在名字语料库上定义一个条件频率分布,显示哪个首字母在男性名字中比在女性名字 中更常用(见图 2-7)。 """
def compare_germanic_and_latin_words():
    """Print aligned Swadesh entries 139-142 across eight languages.

    The language codes compared are en, de, nl, es, fr, pt, it and la.
    Relies on a module-level ``swadesh`` corpus reader being in scope.

    Returns:
        None. One row per selected index is written to standard output.
    """
    languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'it', 'la']
    # Fetch the aligned rows once instead of on every loop iteration.
    aligned_rows = swadesh.entries(languages)
    for i in [139, 140, 141, 142]:
        print(aligned_rows[i])
from nltk.corpus import swadesh
from nltk.corpus import wordnet as wn

# print(swadesh.fileids())

# Spanish -> English lookup table built from the Swadesh wordlist.
es2en = swadesh.entries(['es', 'en'])
translate = {spanish: english for spanish, english in es2en}

words = ['cenizas', 'nieve', 'hincharse']
for w in words:
    english = translate[w]
    print(w, " = ", english)

# for w in translate:
#     print(w)

# WordNet relations, using the first synset returned for each word.
computer = wn.synsets('computer')[0]
computer_hypernyms = computer.hypernyms()
print("The hypernyms of computer are ", computer_hypernyms)
computer_hyponyms = computer.hyponyms()
print("The hyponyms of computer are ", computer_hyponyms)

automobile = wn.synsets('automobile')[0]
automobile_parts = automobile.part_meronyms()
print("The meronyms of automobile are ", automobile_parts)

bird = wn.synsets('bird')[0]
bird_groups = bird.member_holonyms()
print("The holonyms of bird are ", bird_groups)
# -*-coding:utf-8-*- """ This module is an example for Swadesh corpus retrieval """ import re import numpy as np from nltk.corpus import swadesh __author__ = "besnier" germanic_languages = ["en", "de", "nl"] roman_languages = ["fr", "es", "it"] alphabet = list('azertyuiopqsdfghjklmwxcvbn') to_aligner_ger = swadesh.entries(germanic_languages) to_aligner_rom = swadesh.entries(roman_languages) def vocabulary_retrieve(languages, normalize): """ Load and normalize corpora according to chosen languages :param languages: :param normalize: :return: """ to_align = swadesh.entries(languages) normalised_words = [] characters = set() for i, mots in enumerate(to_align): normalised_words.append([])
import nltk
from nltk.corpus import swadesh

# swadesh dict
print(swadesh.fileids())
print(swadesh.words('en'))

# a sample dict
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en[:5])
translate = dict(fr2en)
print(translate['chien'])
print(translate['jeter'])

# compare Germanic with Latin languages
language = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
# Fetch the aligned rows once instead of on every loop iteration.
aligned_rows = swadesh.entries(language)
for i in [139, 140, 141, 142]:
    # print() with a single argument behaves the same on Python 2 and 3.
    print(aligned_rows[i])
def translate(frm, to, word):
    """Translate *word* from language *frm* into language *to*.

    The lookup table is rebuilt on every call from the Swadesh entries for
    the two language codes.

    Args:
        frm: language code of the source language.
        to: language code of the target language.
        word: the source-language entry to look up.

    Returns:
        The *to*-language entry paired with *word*.

    Raises:
        KeyError: if *word* is not among the *frm* entries.
    """
    from nltk.corpus import swadesh

    pairs = swadesh.entries([frm, to])  # (frm entry, to entry) tuples
    lookup = {source: target for source, target in pairs}
    return lookup[word]
words = cfd[template].keys() wordlist = ' '.join(words) print(template, wordlist[:70] + "...") prondict = nltk.corpus.cmudict.dict() text = ['natural', 'language', 'processing'] [ph for w in text for ph in prondict[w][0]] #比较词典 from nltk.corpus import swadesh swadesh.fileids() swadesh.words('en') fr2en = swadesh.entries(['fr', 'en']) fr2en translate = dict(fr2en) translate['chien'] translate['jeter'] de2en = swadesh.entries(['de', 'en']) # German-English es2en = swadesh.entries(['es', 'en']) # Spanish-English translate.update(dict(de2en)) translate.update(dict(es2en)) translate['Hund'] translate['perro'] languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la'] for i in [139, 140, 141, 142]:
def content_fraction(text):
    """Return the fraction of tokens in *text* that are not English stopwords.

    Relies on a module-level ``nltk`` import being in scope.

    Args:
        text: a sized, iterable collection of string tokens.

    Returns:
        A float between 0.0 and 1.0; 0.0 when *text* is empty.
    """
    total = len(text)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    # A set gives constant-time membership tests for every token.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    content = [w for w in text if w.lower() not in stopword_set]
    # float() keeps this a true division on Python 2 as well.
    return float(len(content)) / total


content_fraction(nltk.corpus.reuters.words())
content_fraction(nltk.corpus.inaugural.words())

# Translator
from nltk.corpus import swadesh

languages = ['en', 'ro', 'es', 'fr', 'pt', 'la']
# Fetch the aligned rows once instead of on every loop iteration.
aligned_rows = swadesh.entries(languages)
for i in [100, 141, 143]:
    print(aligned_rows[i])

# Wordnet
# dictionary of English
from nltk.corpus import wordnet as wn

wn.synsets('motorcar')
# In NLTK 3 lemma_names and definition are methods, not attributes.
wn.synset('car.n.01').lemma_names()
wn.synset('car.n.01').definition()

for synset in wn.synsets('car')[1:3]:
    print(synset.lemma_names())

# Depth of a synset
wn.synset('whale.n.02').min_depth()
wn.synset('vertebrate.n.01').min_depth()
lang2country = {languages[i] : country_codes[i] for i in range(len(languages))} #Country_codes converted to pandas dataframe so as to use them in maps country_codes = pd.DataFrame(country_codes) #list of words of different languages stored x = [] #index of the particular language in the above list indx = dict() i = 0 for lang in languages: a = swadesh.entries(['en',lang]) #storing words of each language in list x.append(swadesh.entries([lang])) #storing index of each language indx[lang] = i i = i + 1 country_codes['lang'] = languages #Check box on sidebar to be unchecked if user wants to see map if st.sidebar.checkbox('Check to see country\'s language code' ,False): st.subheader('Hover your mouse over countries to know the language code of various countries') st.markdown('Only blue coloured countries are available for translation') #Map with country_codes being used as locations fig = px.choropleth(country_codes,locations =country_codes[:][0] ,
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import swadesh

# Language codes available in the Swadesh corpus, then the English list.
swadesh.fileids()
swadesh.words("en")

# French -> English pairs and the lookup table built from them.
fr2en = swadesh.entries(["fr", "en"])
fr2en
translate = {french: english for french, english in fr2en}

for french_word in ("chien", "jeter"):
    translate[french_word]
"""2. Using the comparative wordlist, find words that are similar across
German, Italian and English. Can the results obtained be used for
translation?"""

from nltk.corpus import swadesh

# Aligned (German, English, Italian) entries.
de_en_it = swadesh.entries(['de', 'en', 'it'])
print(de_en_it)

# ('lang', 'long', 'lungo') - long
# 'Nase', 'nose', 'naso' - nose
# 'Name', 'name', 'nome' - name
__author__ = 'lizhifeng'

from nltk.corpus import swadesh

# Language codes available in the Swadesh corpus, then the English list.
# print() with a single argument behaves the same on Python 2 and 3.
print(swadesh.fileids())
print(swadesh.words('en'))

# French -> English pairs and the lookup table built from them.
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)
translate = dict(fr2en)
print(translate["chien"])
# [Synset('forest.n.01')]
wn.synset('electrode.n.01').part_holonyms()  # @UndefinedVariable
# [Synset('battery.n.02'), Synset('electrolytic_cell.n.01'), Synset('electronic_equipment.n.01'), Synset('tube.n.02')]
wn.synset('terminal.n.02').part_holonyms()  # @UndefinedVariable
# [Synset('battery.n.02'), Synset('electrical_device.n.01')]
wn.synset('calcium_carbonate.n.01').substance_holonyms()  # @UndefinedVariable
# [Synset('calcite.n.01'), Synset('chalk.n.01')]

# 6.☼ In the discussion of comparative wordlists,
# we created an object called translate which you could look up using words
# in both German and Spanish in order to get corresponding words in English.
# What problem might arise with this approach?
# Can you suggest a way to avoid this problem?
from nltk.corpus import swadesh

translate = {}
de2en = swadesh.entries(['de', 'en'])  # German-English
es2en = swadesh.entries(['es', 'en'])  # Spanish-English
for pairs in (de2en, es2en):
    translate.update(pairs)
translate['Hund']   # 'dog'
translate['perro']  # 'dog'

# Problem: the lookup key is not normalised, so the table is not robust;
# for example translate['hund'] does not give the right result.
# Solution: build normalised (key, value) pairs and update once more. The
# keys can be processed as needed, e.g. ignoring case, converting between
# singular and plural, stemming and so on.
# The entries added earlier are kept alongside the normalised ones.
translate.update({key.lower(): value for key, value in de2en})
translate.update({key.lower(): value for key, value in es2en})

# 7.☼ According to Strunk and White's Elements of Style, the word however,
# used at the start of a sentence, means "in whatever way" or "to whatever extent",
import nltk
from nltk.corpus import swadesh

# Lookup table built from the Swadesh entries for the codes 'en' and 'ca'.
en2ca = swadesh.entries(['en', 'ca'])
translate = dict(en2ca)
# print() with a single argument behaves the same on Python 2 and 3.
print(translate['dog'])
from nltk.corpus import swadesh

# Print the two-letter language codes.
language_codes = swadesh.fileids()
print(language_codes)
print()

# Print the list of English words paired with their German translations.
en_to_de = swadesh.entries(["en", "de"])
print(en_to_de)
print()

# Print the German translation of "dog".
translate = {english: german for english, german in en_to_de}
print(translate["dog"])