Example #1
0
def fun6():
    """Print every entry of the Rotokas Toolbox lexicon ('rotokas.dic')."""
    # Lexical tools: Toolbox and Shoebox.
    # A Toolbox file consists of a collection of entries, where each entry is
    # made up of one or more fields. Most fields are optional or repeatable,
    # which means this lexical resource cannot be treated as a table or
    # spreadsheet.
    # Entries consist of a series of attribute-value pairs, such as
    # ('ps', 'V') to indicate that the part of speech is 'V' (verb), and
    # ('ge', 'gag') to indicate that the English gloss is 'gag'. The last
    # three pairs contain an example sentence in Rotokas and its translations
    # into Tok Pisin and English.
    from nltk.corpus import toolbox

    # print() with a single argument behaves identically on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print(toolbox.entries('rotokas.dic'))
Example #2
0
# NOTE(review): ``translate`` and ``swadesh`` are not defined in this excerpt;
# presumably a French->English dict and nltk.corpus.swadesh set up earlier.
translate['chien']
translate['jeter']

# Extend the dictionary with German->English and Spanish->English pairs.
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
translate['perro']

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
# Build the aligned multi-language table once instead of once per iteration.
aligned_entries = swadesh.entries(languages)
for i in [139, 140, 141, 142]:
    print(aligned_entries[i])

from nltk.corpus import toolbox
# Load the entries of the Rotokas Toolbox dictionary (result not kept).
toolbox.entries('rotokas.dic')

from nltk.corpus import wordnet as wn
# Synsets that contain the word "motorcar".
wn.synsets('motorcar')

# Look the synset up once and query it for names, gloss, examples and lemmas.
car_synset = wn.synset('car.n.01')
car_synset.lemma_names()

car_synset.definition()
car_synset.examples()

car_synset.lemmas()

# Same idea for one specific lemma of that synset.
automobile_lemma = wn.lemma('car.n.01.automobile')
automobile_lemma
automobile_lemma.synset()
automobile_lemma.name()

# The word "car" itself belongs to several synsets.
wn.synsets('car')
for synset in wn.synsets('car'):
import nltk
from nltk.corpus import toolbox

# Show only the first two entries of the Rotokas Toolbox dictionary.
# print() with a single argument works on both Python 2 and Python 3; the
# bare print statement is a SyntaxError on Python 3.
print(toolbox.entries('rotokas.dic')[0:2])
Example #4
0
# NOTE(review): ``swadesh`` is not imported in this excerpt; presumably
# ``from nltk.corpus import swadesh`` appears earlier in the file.
print(swadesh.fileids())

# French/English word pairs.
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)

# Turn the pairs into a French -> English lookup table.
translate = dict(fr2en)
print(translate['chien'])

# Add German -> English and Spanish -> English pairs to the same table.
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
print(translate['Hund'])
print(translate['perro'])

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
# Build the aligned multi-language table once instead of once per iteration.
aligned_entries = swadesh.entries(languages)
for i in [139, 140, 141, 142]:
    print(aligned_entries[i])

'''
#词汇工具:ToolBox和Shoebox
from nltk.corpus import toolbox
print(toolbox.entries('rotokas.dic'))







('sing', 'singen', 'zingen', 'cantar', 'chanter', 'cantar', 'canere')
('play', 'spielen', 'spelen', 'jugar', 'jouer', 'jogar, brincar', 'ludere')
('float', 'schweben', 'zweven', 'flotar', 'flotter', 'flutuar, boiar', 'fluctuare')
'''

# Shoebox and Toolbox Lexicons
# Perhaps the single most popular tool used by linguists for managing data is Toolbox, 
# previously known as Shoebox since it replaces the field linguist's traditional shoebox 
# full of file cards. Toolbox is freely downloadable from 
# http://www.sil.org/computing/toolbox/.

# A Toolbox file consists of a collection of entries, where each entry is made up of one or more fields.
# Most fields are optional or repeatable, which means that this kind of lexical resource cannot be treated as a table or spreadsheet.
# Here is a dictionary for the Rotokas language. We see just the first entry, for the word kaa meaning "to gag":
from nltk.corpus import toolbox
# Load the entries of the Rotokas dictionary file; the result is neither
# stored nor printed here (sample output is shown in the string below).
toolbox.entries('rotokas.dic')  # @UndefinedVariable
'''
[('kaa', [('ps', 'V'), ('pt', 'A'), ('ge', 'gag'), ('tkp', 'nek i pas'),
('dcsv', 'true'), ('vx', '1'), ('sc', '???'), ('dt', '29/Oct/2005'),
('ex', 'Apoka ira kaaroi aioa-ia reoreopaoro.'),
('xp', 'Kaikai i pas long nek bilong Apoka bikos em i kaikai na toktok.'),
('xe', 'Apoka is gagging from food while talking.')]), ...]
'''
# Entries consist of a series of attribute-value pairs, 
# like ('ps', 'V') to indicate that the part-of-speech is 'V' (verb), 
# and ('ge', 'gag') to indicate that the gloss-into-English is 'gag'. 
# The last three pairs contain an example sentence in Rotokas and its translations into 
# Tok Pisin and English.

# The loose structure of Toolbox files makes it hard for us to do much more with them at this stage. XML provides a powerful way to process this kind of corpus and we will return to this topic in Chapter 11.
# The Rotokas language is spoken on the island of Bougainville, Papua New Guinea. This lexicon was contributed to NLTK by Stuart Robinson. 
import nltk
from nltk.corpus import toolbox
# Show only the first two entries of the Rotokas Toolbox dictionary.
# print() with a single argument works on both Python 2 and Python 3; the
# bare print statement is a SyntaxError on Python 3.
print(toolbox.entries('rotokas.dic')[0:2])
Example #7
0
            
# Collect the headwords whose pronunciation ends in the four phones below.
# NOTE(review): ``entries`` is not defined in this excerpt; presumably a
# sequence of (word, pronunciation) pairs built earlier - confirm.
syllable = ["N", "IH0", "K", "S"]
word = [
    headword
    for headword, phones in entries
    if phones[-4:] == syllable
]
print("syllable", word)

def stress(pron):
    """Return, in order, every digit character found in the strings of *pron*.

    Each element of *pron* is scanned character by character; characters for
    which ``str.isdigit()`` is true are collected into a single flat list.
    """
    digits = []
    for phone in pron:
        digits.extend(ch for ch in phone if ch.isdigit())
    return digits

# Keep the headwords whose stress digits, read in order, are 0-1-0-2-0.
stress_word = [
    headword
    for headword, phones in entries
    if stress(phones) == ["0", "1", "0", "2", "0"]
]
print("stress_word", stress_word)


#-----------------------

from nltk.corpus import toolbox
# Read every entry of the Rotokas Toolbox dictionary and print the whole
# collection.
# NOTE(review): this rebinds the module-level name ``entries`` that the
# filters earlier in this snippet also read - confirm the reuse is intended.
entries = toolbox.entries("rotokas.dic")
print(entries)

#-----------------------

from nltk.corpus import wordnet as wn
# All lemmas whose name is "car".
lemmas = wn.lemmas("car")

synsets = wn.synsets("motorcar")
for synset in wn.synsets("car"):
    # lemma_names is a method in NLTK 3; without the call this prints the
    # bound method object instead of the list of names.
    print(synset.lemma_names())

print("lemmas", wn.lemmas("car"))

# Call the accessors so the names hold the results, not bound methods.
lemmas = wn.synset("car.n.01").lemmas()
lemma_name = wn.synset("car.n.01").lemma_names()
Example #8
0
def toolbox():
    """Load the entries of the Rotokas Toolbox lexicon ('rotokas.dic').

    The loaded entries are discarded, as in the original call; the function
    returns None.
    """
    # This function is itself named ``toolbox``, so a bare ``toolbox`` inside
    # the body resolves to the function rather than the NLTK corpus reader and
    # ``toolbox.entries`` raises AttributeError. Import the reader under a
    # distinct local name instead.
    from nltk.corpus import toolbox as toolbox_corpus

    toolbox_corpus.entries('rotokas.dic')