Ejemplo n.º 1
0
def demo():
    """Print a few sample fields from the Rotokas Toolbox lexicon.

    Loads ``rotokas.dic`` through ``nltk.corpus.toolbox`` and prints, in
    order: the tag and text of the first field of the fourth record, the
    tag and text of up to ten fields of the first record, and the text of
    up to ten ``lx`` fields found directly under ``record`` elements.

    Every ``print`` is written as a single parenthesised argument so the
    function produces the same output whether ``print`` is a statement
    (Python 2) or a function (Python 3).
    """
    from nltk.corpus import toolbox
    from itertools import islice

    lexicon = toolbox.xml('rotokas.dic')
    first_field = lexicon[3][0]
    print('first field in fourth record:')
    print(first_field.tag)
    print(first_field.text)

    print('\nfields in sequential order:')
    # find() yields only the first matching record; iterating that element
    # walks its child fields in document order.
    for field in islice(lexicon.find('record'), 10):
        # Format into one string so the "tag text" line is not printed as a
        # tuple when print is a statement.
        print('%s %s' % (field.tag, field.text))

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)
Ejemplo n.º 2
0
def demo():
    """Print a few sample fields from the Rotokas Toolbox lexicon.

    Loads ``rotokas.dic`` through ``nltk.corpus.toolbox`` and prints, in
    order: the tag and text of the first field of the fourth record, the
    tag and text of up to ten fields of the first record, and the text of
    up to ten ``lx`` fields found directly under ``record`` elements.

    Every ``print`` is written as a single parenthesised argument so the
    function produces the same output whether ``print`` is a statement
    (Python 2) or a function (Python 3).
    """
    from nltk.corpus import toolbox
    from itertools import islice

    lexicon = toolbox.xml('rotokas.dic')
    first_field = lexicon[3][0]
    print('first field in fourth record:')
    print(first_field.tag)
    print(first_field.text)

    print('\nfields in sequential order:')
    # find() yields only the first matching record; iterating that element
    # walks its child fields in document order.
    for field in islice(lexicon.find('record'), 10):
        # Format into one string so the "tag text" line is not printed as a
        # tuple when print is a statement.
        print('%s %s' % (field.tag, field.text))

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)
Ejemplo n.º 3
0
from collections import defaultdict

# Short labels for speakers: every speaker listed in top5 (pairs whose second
# item is ignored here) maps to the first four characters of its name; any
# other speaker falls back to 'OTH' through the defaultdict factory.
abbreviate = defaultdict(lambda: 'OTH')
abbreviate.update((name, name[:4]) for name, _unused in top5)

# Index the mapping (rather than calling .get) so that unknown speakers go
# through the default factory.
speaker_seq2 = [abbreviate[name] for name in speaker_seq]

# Tabulate how often each abbreviated speaker is followed by each other one.
cfd = nltk.ConditionalFreqDist(nltk.bigrams(speaker_seq2))
cfd.tabulate()

# 11.4.4 Accessing Toolbox data with ElementTree
from nltk.corpus import toolbox

# Two ways to access the contents of the lexicon object
# 1) By index
# Index access: lexicon[3] returns entry number 3 (the fourth entry, counting
# from 0), and lexicon[3][0] returns its first field.
lexicon = toolbox.xml('rotokas.dic')
# The three bare expressions below are evaluated and their values discarded
# when this runs as a script (presumably copied from an interactive session).
lexicon[3][0]
lexicon[3][0].tag
lexicon[3][0].text

# 2) By path
# Path access: take every match of 'record/lx', read each element's text
# content, and normalise it to lowercase. The resulting list is not bound to
# a name.
[lexeme.text.lower() for lexeme in lexicon.findall('record/lx')]

# Toolbox data is in XML format.
import sys
from nltk.util import elementtree_indent
from xml.etree.ElementTree import ElementTree

# NOTE(review): elementtree_indent presumably re-indents the tree in place for
# pretty-printing -- confirm against nltk.util.
elementtree_indent(lexicon)
# Wrap the fourth entry in its own ElementTree object.
tree = ElementTree(lexicon[3])
Ejemplo n.º 4
0
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
corresponds to 
12.3.1   Accessing Toolbox Data
in http://nltk.sourceforge.net/lite/doc/en/data.html
"""

import sys

try:
    # Prefer the standard-library ElementTree, as the other Toolbox examples
    # do; nltk.etree is absent from current NLTK releases.
    from xml.etree.ElementTree import ElementTree
except ImportError:
    # Fallback for old environments that only provide NLTK's bundled copy.
    from nltk.etree.ElementTree import ElementTree

from nltk.corpus import toolbox

lexicon = toolbox.xml('rotokas.dic')

# Average number of fields per record: len() of a record element is its
# number of child fields.
records = lexicon.findall('record')
num_entries = len(records)
sum_size = sum(len(entry) for entry in records)
# Floor division keeps the integer result the Python 2 "/" operator gave, and
# the single parenthesised argument prints identically on Python 2 and 3.
# An empty lexicon still raises ZeroDivisionError, as before.
print(sum_size // num_entries)

# Dump the fourth record as XML on standard output.
fourth_entry = records[3]
tree = ElementTree(fourth_entry)
if sys.version_info[0] >= 3:
    # On Python 3 sys.stdout is a text stream, while write() emits bytes
    # unless asked for encoding="unicode".
    tree.write(sys.stdout, encoding='unicode')
else:
    tree.write(sys.stdout)
Ejemplo n.º 5
0
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
corresponds to 
12.3.1   Accessing Toolbox Data
in http://nltk.sourceforge.net/lite/doc/en/data.html
"""

from nltk.corpus import toolbox

lexicon = toolbox.xml('rotokas.dic')

# explicit loop approach: collect the lower-cased text of every 'lx' field
# found directly under a 'record' element
lexemes = []
for lx_field in lexicon.findall('record/lx'):
    lexemes.append(lx_field.text.lower())

# list comprehension approach: builds the same list in a single expression
lexemes2 = [lx_field.text.lower() for lx_field in lexicon.findall('record/lx')]

##if lexemes != lexemes2:
##    print 'error two lists not equal'
##else:
##    print repr(lexemes)

import re